includes/Sanitizer.php

   1 <?php
   2 /**
   3  * XHTML sanitizer for MediaWiki
   4  *
   5  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * http://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @file
  24  * @ingroup Parser
  25  */
  26
  27 /**
  28  * Regular expression to match various types of character references in
  29  * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
  30  */
  31 define( 'MW_CHAR_REFS_REGEX',
  32         '/&([A-Za-z0-9\x80-\xff]+);
  33          |&\#([0-9]+);
  34          |&\#x([0-9A-Za-z]+);
  35          |&\#X([0-9A-Za-z]+);
  36          |(&)/x' );
  37
/**
 * Regular expression to match HTML/XML attribute pairs within a tag.
 * Allows some... latitude.
 * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
 *
 * A pair must be preceded by whitespace or the start of the string and be
 * followed by whitespace or the end of the string. The value part is
 * optional, so a bare attribute name also matches.
 *
 * Capture groups:
 *  1 - attribute name, optionally prefixed with "xml:" or "xmlns:"
 *  2 - the whole "= value" part, including the equals sign
 *  3 - contents of a double-quoted value
 *  4 - contents of a single-quoted value
 *  5 - an unquoted value
 *  6 - an unquoted "#" followed by hexadecimal digits
 */
# Character class for one attribute-name character; reused further down
# to build MW_XMLNS_ATTRIBUTE_PATTRN.
$attrib = '[A-Za-z0-9]';
# Character class for one whitespace character: tab, line feed,
# carriage return or space.
$space = '[\x09\x0a\x0d\x20]';
# The pattern uses the "x" modifier, so the line breaks, indentation and
# "#" comments inside the string below are part of the regex source but
# are not matched literally.
define( 'MW_ATTRIBS_REGEX',
        "/(?:^|$space)((?:xml:|xmlns:)?$attrib+)
          ($space*=$space*
                (?:
                 # The attribute value: quoted or alone
                  \"([^<\"]*)\"
                 | '([^<']*)'
                 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
                 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                                                         # colors are specified like this.
                                                         # We'll be normalizing it.
                )
           )?(?=$space|\$)/sx" );
  58
/**
 * Regular expression to match URIs that could trigger script execution
 *
 * Matches the word "javascript" or "vbscript" (case-insensitively) when it
 * appears at the start of the string, after whitespace, or after a closing
 * comment marker (optionally followed by whitespace), and is itself
 * followed by a non-word character or the end of the string.
 */
define( 'MW_EVIL_URI_PATTERN', '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i' );
  63
/**
 * Regular expression to match namespace attributes
 *
 * Matches an attribute name that consists of "xmlns:" followed by one or
 * more characters from the $attrib character class and nothing else.
 * The constant name is spelled "PATTRN"; code that tests attribute names
 * refers to it by that exact spelling.
 */
define( 'MW_XMLNS_ATTRIBUTE_PATTRN', "/^xmlns:$attrib+$/" );
  68
/**
 * List of all named character entities defined in HTML 4.01
 * http://www.w3.org/TR/html4/sgml/entities.html
 *
 * Keys are the entity names without the leading "&" and trailing ";".
 * Names are case-sensitive (for example "Aacute" and "aacute" are distinct
 * entries). Values are the decimal code points of the characters the
 * entities stand for.
 * @private
 */
# Explicit global declaration; presumably needed because this file may be
# included from inside a function -- confirm against the loader.
global $wgHtmlEntities;
$wgHtmlEntities = array(
        'Aacute'   => 193,
        'aacute'   => 225,
        'Acirc'    => 194,
        'acirc'    => 226,
        'acute'    => 180,
        'AElig'    => 198,
        'aelig'    => 230,
        'Agrave'   => 192,
        'agrave'   => 224,
        'alefsym'  => 8501,
        'Alpha'    => 913,
        'alpha'    => 945,
        'amp'      => 38,
        'and'      => 8743,
        'ang'      => 8736,
        'Aring'    => 197,
        'aring'    => 229,
        'asymp'    => 8776,
        'Atilde'   => 195,
        'atilde'   => 227,
        'Auml'     => 196,
        'auml'     => 228,
        'bdquo'    => 8222,
        'Beta'     => 914,
        'beta'     => 946,
        'brvbar'   => 166,
        'bull'     => 8226,
        'cap'      => 8745,
        'Ccedil'   => 199,
        'ccedil'   => 231,
        'cedil'    => 184,
        'cent'     => 162,
        'Chi'      => 935,
        'chi'      => 967,
        'circ'     => 710,
        'clubs'    => 9827,
        'cong'     => 8773,
        'copy'     => 169,
        'crarr'    => 8629,
        'cup'      => 8746,
        'curren'   => 164,
        'dagger'   => 8224,
        'Dagger'   => 8225,
        'darr'     => 8595,
        'dArr'     => 8659,
        'deg'      => 176,
        'Delta'    => 916,
        'delta'    => 948,
        'diams'    => 9830,
        'divide'   => 247,
        'Eacute'   => 201,
        'eacute'   => 233,
        'Ecirc'    => 202,
        'ecirc'    => 234,
        'Egrave'   => 200,
        'egrave'   => 232,
        'empty'    => 8709,
        'emsp'     => 8195,
        'ensp'     => 8194,
        'Epsilon'  => 917,
        'epsilon'  => 949,
        'equiv'    => 8801,
        'Eta'      => 919,
        'eta'      => 951,
        'ETH'      => 208,
        'eth'      => 240,
        'Euml'     => 203,
        'euml'     => 235,
        'euro'     => 8364,
        'exist'    => 8707,
        'fnof'     => 402,
        'forall'   => 8704,
        'frac12'   => 189,
        'frac14'   => 188,
        'frac34'   => 190,
        'frasl'    => 8260,
        'Gamma'    => 915,
        'gamma'    => 947,
        'ge'       => 8805,
        'gt'       => 62,
        'harr'     => 8596,
        'hArr'     => 8660,
        'hearts'   => 9829,
        'hellip'   => 8230,
        'Iacute'   => 205,
        'iacute'   => 237,
        'Icirc'    => 206,
        'icirc'    => 238,
        'iexcl'    => 161,
        'Igrave'   => 204,
        'igrave'   => 236,
        'image'    => 8465,
        'infin'    => 8734,
        'int'      => 8747,
        'Iota'     => 921,
        'iota'     => 953,
        'iquest'   => 191,
        'isin'     => 8712,
        'Iuml'     => 207,
        'iuml'     => 239,
        'Kappa'    => 922,
        'kappa'    => 954,
        'Lambda'   => 923,
        'lambda'   => 955,
        'lang'     => 9001,
        'laquo'    => 171,
        'larr'     => 8592,
        'lArr'     => 8656,
        'lceil'    => 8968,
        'ldquo'    => 8220,
        'le'       => 8804,
        'lfloor'   => 8970,
        'lowast'   => 8727,
        'loz'      => 9674,
        'lrm'      => 8206,
        'lsaquo'   => 8249,
        'lsquo'    => 8216,
        'lt'       => 60,
        'macr'     => 175,
        'mdash'    => 8212,
        'micro'    => 181,
        'middot'   => 183,
        'minus'    => 8722,
        'Mu'       => 924,
        'mu'       => 956,
        'nabla'    => 8711,
        'nbsp'     => 160,
        'ndash'    => 8211,
        'ne'       => 8800,
        'ni'       => 8715,
        'not'      => 172,
        'notin'    => 8713,
        'nsub'     => 8836,
        'Ntilde'   => 209,
        'ntilde'   => 241,
        'Nu'       => 925,
        'nu'       => 957,
        'Oacute'   => 211,
        'oacute'   => 243,
        'Ocirc'    => 212,
        'ocirc'    => 244,
        'OElig'    => 338,
        'oelig'    => 339,
        'Ograve'   => 210,
        'ograve'   => 242,
        'oline'    => 8254,
        'Omega'    => 937,
        'omega'    => 969,
        'Omicron'  => 927,
        'omicron'  => 959,
        'oplus'    => 8853,
        'or'       => 8744,
        'ordf'     => 170,
        'ordm'     => 186,
        'Oslash'   => 216,
        'oslash'   => 248,
        'Otilde'   => 213,
        'otilde'   => 245,
        'otimes'   => 8855,
        'Ouml'     => 214,
        'ouml'     => 246,
        'para'     => 182,
        'part'     => 8706,
        'permil'   => 8240,
        'perp'     => 8869,
        'Phi'      => 934,
        'phi'      => 966,
        'Pi'       => 928,
        'pi'       => 960,
        'piv'      => 982,
        'plusmn'   => 177,
        'pound'    => 163,
        'prime'    => 8242,
        'Prime'    => 8243,
        'prod'     => 8719,
        'prop'     => 8733,
        'Psi'      => 936,
        'psi'      => 968,
        'quot'     => 34,
        'radic'    => 8730,
        'rang'     => 9002,
        'raquo'    => 187,
        'rarr'     => 8594,
        'rArr'     => 8658,
        'rceil'    => 8969,
        'rdquo'    => 8221,
        'real'     => 8476,
        'reg'      => 174,
        'rfloor'   => 8971,
        'Rho'      => 929,
        'rho'      => 961,
        'rlm'      => 8207,
        'rsaquo'   => 8250,
        'rsquo'    => 8217,
        'sbquo'    => 8218,
        'Scaron'   => 352,
        'scaron'   => 353,
        'sdot'     => 8901,
        'sect'     => 167,
        'shy'      => 173,
        'Sigma'    => 931,
        'sigma'    => 963,
        'sigmaf'   => 962,
        'sim'      => 8764,
        'spades'   => 9824,
        'sub'      => 8834,
        'sube'     => 8838,
        'sum'      => 8721,
        'sup'      => 8835,
        'sup1'     => 185,
        'sup2'     => 178,
        'sup3'     => 179,
        'supe'     => 8839,
        'szlig'    => 223,
        'Tau'      => 932,
        'tau'      => 964,
        'there4'   => 8756,
        'Theta'    => 920,
        'theta'    => 952,
        'thetasym' => 977,
        'thinsp'   => 8201,
        'THORN'    => 222,
        'thorn'    => 254,
        'tilde'    => 732,
        'times'    => 215,
        'trade'    => 8482,
        'Uacute'   => 218,
        'uacute'   => 250,
        'uarr'     => 8593,
        'uArr'     => 8657,
        'Ucirc'    => 219,
        'ucirc'    => 251,
        'Ugrave'   => 217,
        'ugrave'   => 249,
        'uml'      => 168,
        'upsih'    => 978,
        'Upsilon'  => 933,
        'upsilon'  => 965,
        'Uuml'     => 220,
        'uuml'     => 252,
        'weierp'   => 8472,
        'Xi'       => 926,
        'xi'       => 958,
        'Yacute'   => 221,
        'yacute'   => 253,
        'yen'      => 165,
        'Yuml'     => 376,
        'yuml'     => 255,
        'Zeta'     => 918,
        'zeta'     => 950,
        'zwj'      => 8205,
        'zwnj'     => 8204 );
 328
/**
 * Character entity aliases accepted by MediaWiki
 *
 * Maps an alternative entity name to the standard entity name it stands
 * for. Both entries below are spellings of "rlm", one in Hebrew letters
 * and one in Arabic letters. The target name is presumably looked up in
 * $wgHtmlEntities by the code that consumes this table.
 */
global $wgHtmlEntityAliases;
$wgHtmlEntityAliases = array(
        'רלמ' => 'rlm',
        'رلم' => 'rlm',
);
 337
 338
 339 /**
 340  * XHTML sanitizer for MediaWiki
 341  * @ingroup Parser
 342  */
 343 class Sanitizer {
 344         /**
 345          * Cleans up HTML, removes dangerous tags and attributes, and
 346          * removes HTML comments
 347          * @private
 348          * @param $text String
 349          * @param $processCallback Callback to do any variable or parameter replacements in HTML attribute values
 350          * @param $args Array for the processing callback
 351          * @param $extratags Array for any extra tags to include
 352          * @param $removetags Array for any tags (default or extra) to exclude
 353          * @return string
 354          */
 355         static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
 356                 global $wgUseTidy;
 357
 358                 static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
 359                         $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
 360
 361                 wfProfileIn( __METHOD__ );
 362
 363                 if ( !$staticInitialised ) {
 364
 365                         $htmlpairsStatic = array( # Tags that must be closed
 366                                 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 367                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 368                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
 369                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 370                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u', 'abbr'
 371                         );
 372                         $htmlsingle = array(
 373                                 'br', 'hr', 'li', 'dt', 'dd'
 374                         );
 375                         $htmlsingleonly = array( # Elements that cannot have close tags
 376                                 'br', 'hr'
 377                         );
 378                         $htmlnest = array( # Tags that can be nested--??
 379                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 380                                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
 381                         );
 382                         $tabletags = array( # Can only appear inside table, we will close them
 383                                 'td', 'th', 'tr',
 384                         );
 385                         $htmllist = array( # Tags used by list
 386                                 'ul','ol',
 387                         );
 388                         $listtags = array( # Tags that can appear in a list
 389                                 'li',
 390                         );
 391
 392                         $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
 393                         $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
 394
 395                         # Convert them all to hashtables for faster lookup
 396                         $vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
 397                                 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
 398                         foreach ( $vars as $var ) {
 399                                 $$var = array_flip( $$var );
 400                         }
 401                         $staticInitialised = true;
 402                 }
 403                 # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
 404                 $extratags = array_flip( $extratags );
 405                 $removetags = array_flip( $removetags );
 406                 $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
 407                 $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ) , $removetags );
 408
 409                 # Remove HTML comments
 410                 $text = Sanitizer::removeHTMLcomments( $text );
 411                 $bits = explode( '<', $text );
 412                 $text = str_replace( '>', '&gt;', array_shift( $bits ) );
 413                 if ( !$wgUseTidy ) {
 414                         $tagstack = $tablestack = array();
 415                         foreach ( $bits as $x ) {
 416                                 $regs = array();
 417                                 # $slash: Does the current element start with a '/'?
 418                                 # $t: Current element name
 419                                 # $params: String between element name and >
 420                                 # $brace: Ending '>' or '/>'
 421                                 # $rest: Everything until the next element of $bits
 422                                 if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
 423                                         list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
 424                                 } else {
 425                                         $slash = $t = $params = $brace = $rest = null;
 426                                 }
 427
 428                                 $badtag = false;
 429                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
 430                                         # Check our stack
 431                                         if ( $slash && isset( $htmlsingleonly[$t] ) ) {
 432                                                 $badtag = true;
 433                                         } elseif ( $slash ) {
 434                                                 # Closing a tag... is it the one we just opened?
 435                                                 $ot = @array_pop( $tagstack );
 436                                                 if ( $ot != $t ) {
 437                                                         if ( isset( $htmlsingleallowed[$ot] ) ) {
 438                                                                 # Pop all elements with an optional close tag
 439                                                                 # and see if we find a match below them
 440                                                                 $optstack = array();
 441                                                                 array_push( $optstack, $ot );
 442                                                                 $ot = @array_pop( $tagstack );
 443                                                                 while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
 444                                                                         array_push( $optstack, $ot );
 445                                                                         $ot = @array_pop( $tagstack );
 446                                                                 }
 447                                                                 if ( $t != $ot ) {
 448                                                                         # No match. Push the optional elements back again
 449                                                                         $badtag = true;
 450                                                                         while ( $ot = @array_pop( $optstack ) ) {
 451                                                                                 array_push( $tagstack, $ot );
 452                                                                         }
 453                                                                 }
 454                                                         } else {
 455                                                                 @array_push( $tagstack, $ot );
 456                                                                 # <li> can be nested in <ul> or <ol>, skip those cases:
 457                                                                 if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
 458                                                                         $badtag = true;
 459                                                                 }
 460                                                         }
 461                                                 } else {
 462                                                         if ( $t == 'table' ) {
 463                                                                 $tagstack = array_pop( $tablestack );
 464                                                         }
 465                                                 }
 466                                                 $newparams = '';
 467                                         } else {
 468                                                 # Keep track for later
 469                                                 if ( isset( $tabletags[$t] ) &&
 470                                                 !in_array( 'table', $tagstack ) ) {
 471                                                         $badtag = true;
 472                                                 } elseif ( in_array( $t, $tagstack ) &&
 473                                                 !isset( $htmlnest [$t ] ) ) {
 474                                                         $badtag = true;
 475                                                 # Is it a self closed htmlpair ? (bug 5487)
 476                                                 } elseif ( $brace == '/>' &&
 477                                                 isset( $htmlpairs[$t] ) ) {
 478                                                         $badtag = true;
 479                                                 } elseif ( isset( $htmlsingleonly[$t] ) ) {
 480                                                         # Hack to force empty tag for uncloseable elements
 481                                                         $brace = '/>';
 482                                                 } elseif ( isset( $htmlsingle[$t] ) ) {
 483                                                         # Hack to not close $htmlsingle tags
 484                                                         $brace = null;
 485                                                 } elseif ( isset( $tabletags[$t] )
 486                                                 && in_array( $t, $tagstack ) ) {
 487                                                         // New table tag but forgot to close the previous one
 488                                                         $text .= "</$t>";
 489                                                 } else {
 490                                                         if ( $t == 'table' ) {
 491                                                                 array_push( $tablestack, $tagstack );
 492                                                                 $tagstack = array();
 493                                                         }
 494                                                         array_push( $tagstack, $t );
 495                                                 }
 496
 497                                                 # Replace any variables or template parameters with
 498                                                 # plaintext results.
 499                                                 if( is_callable( $processCallback ) ) {
 500                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
 501                                                 }
 502
 503                                                 # Strip non-approved attributes from the tag
 504                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 505                                         }
 506                                         if ( !$badtag ) {
 507                                                 $rest = str_replace( '>', '&gt;', $rest );
 508                                                 $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
 509                                                 $text .= "<$slash$t$newparams$close>$rest";
 510                                                 continue;
 511                                         }
 512                                 }
 513                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 514                         }
 515                         # Close off any remaining tags
 516                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 517                                 $text .= "</$t>\n";
 518                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 519                         }
 520                 } else {
 521                         # this might be possible using tidy itself
 522                         foreach ( $bits as $x ) {
 523                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 524                                 $x, $regs );
 525                                 @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
 526                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
 527                                         if( is_callable( $processCallback ) ) {
 528                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
 529                                         }
 530                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 531                                         $rest = str_replace( '>', '&gt;', $rest );
 532                                         $text .= "<$slash$t$newparams$brace$rest";
 533                                 } else {
 534                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 535                                 }
 536                         }
 537                 }
 538                 wfProfileOut( __METHOD__ );
 539                 return $text;
 540         }
 541
 542         /**
 543          * Remove '<!--', '-->', and everything between.
 544          * To avoid leaving blank lines, when a comment is both preceded
 545          * and followed by a newline (ignoring spaces), trim leading and
 546          * trailing spaces and one of the newlines.
 547          *
 548          * @private
 549          * @param $text String
 550          * @return string
 551          */
 552         static function removeHTMLcomments( $text ) {
 553                 wfProfileIn( __METHOD__ );
 554                 while (($start = strpos($text, '<!--')) !== false) {
 555                         $end = strpos($text, '-->', $start + 4);
 556                         if ($end === false) {
 557                                 # Unterminated comment; bail out
 558                                 break;
 559                         }
 560
 561                         $end += 3;
 562
 563                         # Trim space and newline if the comment is both
 564                         # preceded and followed by a newline
 565                         $spaceStart = max($start - 1, 0);
 566                         $spaceLen = $end - $spaceStart;
 567                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 568                                 $spaceStart--;
 569                                 $spaceLen++;
 570                         }
 571                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 572                                 $spaceLen++;
 573                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 574                                 # Remove the comment, leading and trailing
 575                                 # spaces, and leave only one newline.
 576                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 577                         }
 578                         else {
 579                                 # Remove just the comment.
 580                                 $text = substr_replace($text, '', $start, $end - $start);
 581                         }
 582                 }
 583                 wfProfileOut( __METHOD__ );
 584                 return $text;
 585         }
 586
 587         /**
 588          * Take an array of attribute names and values and normalize or discard
 589          * illegal values for the given element type.
 590          *
 591          * - Discards attributes not on a whitelist for the given element
 592          * - Unsafe style attributes are discarded
 593          * - Invalid id attributes are reencoded
 594          *
 595          * @param $attribs Array
 596          * @param $element String
 597          * @return Array
 598          *
 599          * @todo Check for legal values where the DTD limits things.
 600          * @todo Check for unique id attribute :P
 601          */
 602         static function validateTagAttributes( $attribs, $element ) {
 603                 return Sanitizer::validateAttributes( $attribs,
 604                         Sanitizer::attributeWhitelist( $element ) );
 605         }
 606
 607         /**
 608          * Take an array of attribute names and values and normalize or discard
 609          * illegal values for the given whitelist.
 610          *
 611          * - Discards attributes not the given whitelist
 612          * - Unsafe style attributes are discarded
 613          * - Invalid id attributes are reencoded
 614          *
 615          * @param $attribs Array
 616          * @param $whitelist Array: list of allowed attribute names
 617          * @return Array
 618          *
 619          * @todo Check for legal values where the DTD limits things.
 620          * @todo Check for unique id attribute :P
 621          */
 622         static function validateAttributes( $attribs, $whitelist ) {
 623                 global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;
 624
 625                 $whitelist = array_flip( $whitelist );
 626                 $hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';
 627
 628                 $out = array();
 629                 foreach( $attribs as $attribute => $value ) {
 630                         #allow XML namespace declaration if RDFa is enabled
 631                         if ( $wgAllowRdfaAttributes && preg_match( MW_XMLNS_ATTRIBUTE_PATTRN, $attribute ) ) {
 632                                 if ( !preg_match( MW_EVIL_URI_PATTERN, $value ) ) {
 633                                         $out[$attribute] = $value;
 634                                 }
 635
 636                                 continue;
 637                         }
 638
 639                         if( !isset( $whitelist[$attribute] ) ) {
 640                                 continue;
 641                         }
 642
 643                         # Strip javascript "expression" from stylesheets.
 644                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 645                         if( $attribute == 'style' ) {
 646                                 $value = Sanitizer::checkCss( $value );
 647                                 if( $value === false ) {
 648                                         # haxx0r
 649                                         continue;
 650                                 }
 651                         }
 652
 653                         if ( $attribute === 'id' ) {
 654                                 global $wgEnforceHtmlIds;
 655                                 $value = Sanitizer::escapeId( $value,
 656                                         $wgEnforceHtmlIds ? 'noninitial' : 'xml' );
 657                         }
 658
 659                         //RDFa and microdata properties allow URLs, URIs and/or CURIs. check them for sanity
 660                         if ( $attribute === 'rel' || $attribute === 'rev' ||
 661                                 $attribute === 'about' || $attribute === 'property' || $attribute === 'resource' || #RDFa
 662                                 $attribute === 'datatype' || $attribute === 'typeof' ||                             #RDFa
 663                                 $attribute === 'itemid' || $attribute === 'itemprop' || $attribute === 'itemref' || #HTML5 microdata
 664                                 $attribute === 'itemscope' || $attribute === 'itemtype' ) {                         #HTML5 microdata
 665
 666                                 //Paranoia. Allow "simple" values but suppress javascript
 667                                 if ( preg_match( MW_EVIL_URI_PATTERN, $value ) ) {
 668                                         continue;
 669                                 }
 670                         }
 671
 672                         # NOTE: even though elements using href/src are not allowed directly, supply
 673                         #       validation code that can be used by tag hook handlers, etc
 674                         if ( $attribute === 'href' || $attribute === 'src' ) {
 675                                 if ( !preg_match( $hrefExp, $value ) ) {
 676                                         continue; //drop any href or src attributes not using an allowed protocol.
 677                                                   //NOTE: this also drops all relative URLs
 678                                 }
 679                         }
 680
 681                         // If this attribute was previously set, override it.
 682                         // Output should only have one attribute of each name.
 683                         $out[$attribute] = $value;
 684                 }
 685
 686                 if ( $wgAllowMicrodataAttributes ) {
 687                         # There are some complicated validity constraints we need to
 688                         # enforce here.  First of all, we don't want to allow non-standard
 689                         # itemtypes.
 690                         $allowedTypes = array(
 691                                 'http://microformats.org/profile/hcard',
 692                                 'http://microformats.org/profile/hcalendar#vevent',
 693                                 'http://n.whatwg.org/work',
 694                         );
 695                         if ( isset( $out['itemtype'] ) && !in_array( $out['itemtype'],
 696                         $allowedTypes ) ) {
 697                                 # Kill everything
 698                                 unset( $out['itemscope'] );
 699                         }
 700                         # itemtype, itemid, itemref don't make sense without itemscope
 701                         if ( !array_key_exists( 'itemscope', $out ) ) {
 702                                 unset( $out['itemtype'] );
 703                                 unset( $out['itemid'] );
 704                                 unset( $out['itemref'] );
 705                         }
 706                         # TODO: Strip itemprop if we aren't descendants of an itemscope.
 707                 }
 708                 return $out;
 709         }
 710
 711         /**
 712          * Merge two sets of HTML attributes.  Conflicting items in the second set
 713          * will override those in the first, except for 'class' attributes which
 714          * will be combined (if they're both strings).
 715          *
 716          * @todo implement merging for other attributes such as style
 717          * @param $a Array
 718          * @param $b Array
 719          * @return array
 720          */
 721         static function mergeAttributes( $a, $b ) {
 722                 $out = array_merge( $a, $b );
 723                 if( isset( $a['class'] ) && isset( $b['class'] )
 724                 && is_string( $a['class'] ) && is_string( $b['class'] )
 725                 && $a['class'] !== $b['class'] ) {
 726                         $classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}",
 727                                 -1, PREG_SPLIT_NO_EMPTY );
 728                         $out['class'] = implode( ' ', array_unique( $classes ) );
 729                 }
 730                 return $out;
 731         }
 732
 733         /**
 734          * Pick apart some CSS and check it for forbidden or unsafe structures.
 735          * Returns a sanitized string, or false if it was just too evil.
 736          *
 737          * Currently URL references, 'expression', 'tps' are forbidden.
 738          *
 739          * @param $value String
 740          * @return Mixed
 741          */
 742         static function checkCss( $value ) {
 743                 $stripped = Sanitizer::decodeCharReferences( $value );
 744
 745                 // Remove any comments; IE gets token splitting wrong
 746                 $stripped = StringUtils::delimiterReplace( '/*', '*/', ' ', $stripped );
 747
 748                 $value = $stripped;
 749
 750                 // ... and continue checks
 751                 $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
 752                         'codepointToUtf8(hexdec("$1"))', $stripped );
 753                 $stripped = str_replace( '\\', '', $stripped );
 754                 if( preg_match( '/(?:expression|tps*:\/\/|url\\s*\().*/is',
 755                                 $stripped ) ) {
 756                         # haxx0r
 757                         return false;
 758                 }
 759
 760                 return $value;
 761         }
 762
 763         /**
 764          * Take a tag soup fragment listing an HTML element's attributes
 765          * and normalize it to well-formed XML, discarding unwanted attributes.
 766          * Output is safe for further wikitext processing, with escaping of
 767          * values that could trigger problems.
 768          *
 769          * - Normalizes attribute names to lowercase
 770          * - Discards attributes not on a whitelist for the given element
 771          * - Turns broken or invalid entities into plaintext
 772          * - Double-quotes all attribute values
 773          * - Attributes without values are given the name as attribute
 774          * - Double attributes are discarded
 775          * - Unsafe style attributes are discarded
 776          * - Prepends space if there are attributes.
 777          *
 778          * @param $text String
 779          * @param $element String
 780          * @return String
 781          */
 782         static function fixTagAttributes( $text, $element ) {
 783                 if( trim( $text ) == '' ) {
 784                         return '';
 785                 }
 786
 787                 $stripped = Sanitizer::validateTagAttributes(
 788                         Sanitizer::decodeTagAttributes( $text ), $element );
 789
 790                 $attribs = array();
 791                 foreach( $stripped as $attribute => $value ) {
 792                         $encAttribute = htmlspecialchars( $attribute );
 793                         $encValue = Sanitizer::safeEncodeAttribute( $value );
 794
 795                         $attribs[] = "$encAttribute=\"$encValue\"";
 796                 }
 797                 return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
 798         }
 799
 800         /**
 801          * Encode an attribute value for HTML output.
 802          * @param $text String
 803          * @return HTML-encoded text fragment
 804          */
 805         static function encodeAttribute( $text ) {
 806                 $encValue = htmlspecialchars( $text, ENT_QUOTES );
 807
 808                 // Whitespace is normalized during attribute decoding,
 809                 // so if we've been passed non-spaces we must encode them
 810                 // ahead of time or they won't be preserved.
 811                 $encValue = strtr( $encValue, array(
 812                         "\n" => '&#10;',
 813                         "\r" => '&#13;',
 814                         "\t" => '&#9;',
 815                 ) );
 816
 817                 return $encValue;
 818         }
 819
 820         /**
 821          * Encode an attribute value for HTML tags, with extra armoring
 822          * against further wiki processing.
 823          * @param $text String
 824          * @return HTML-encoded text fragment
 825          */
 826         static function safeEncodeAttribute( $text ) {
 827                 $encValue = Sanitizer::encodeAttribute( $text );
 828
 829                 # Templates and links may be expanded in later parsing,
 830                 # creating invalid or dangerous output. Suppress this.
 831                 $encValue = strtr( $encValue, array(
 832                         '<'    => '&lt;',   // This should never happen,
 833                         '>'    => '&gt;',   // we've received invalid input
 834                         '"'    => '&quot;', // which should have been escaped.
 835                         '{'    => '&#123;',
 836                         '['    => '&#91;',
 837                         "''"   => '&#39;&#39;',
 838                         'ISBN' => '&#73;SBN',
 839                         'RFC'  => '&#82;FC',
 840                         'PMID' => '&#80;MID',
 841                         '|'    => '&#124;',
 842                         '__'   => '&#95;_',
 843                 ) );
 844
 845                 # Stupid hack
 846                 $encValue = preg_replace_callback(
 847                         '/(' . wfUrlProtocols() . ')/',
 848                         array( 'Sanitizer', 'armorLinksCallback' ),
 849                         $encValue );
 850                 return $encValue;
 851         }
 852
 853         /**
 854          * Given a value escape it so that it can be used in an id attribute and
 855          * return it, this does not validate the value however (see first link)
 856          *
 857          * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
 858          *                                                          in the id and
 859          *                                                          name attributes
 860          * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 861          *
 862          * @param $id String: id to validate
 863          * @param $options Mixed: string or array of strings (default is array()):
 864          *   'noninitial': This is a non-initial fragment of an id, not a full id,
 865          *       so don't pay attention if the first character isn't valid at the
 866          *       beginning of an id.
 867          *   'xml': Don't restrict the id to be HTML4-compatible.  This option
 868          *       allows any alphabetic character to be used, per the XML standard.
 869          *       Therefore, it also completely changes the type of escaping: instead
 870          *       of weird dot-encoding, runs of invalid characters (mostly
 871          *       whitespace) are just compressed into a single underscore.
 872          * @return String
 873          */
 874         static function escapeId( $id, $options = array() ) {
 875                 $options = (array)$options;
 876
 877                 if ( !in_array( 'xml', $options ) ) {
 878                         # HTML4-style escaping
 879                         static $replace = array(
 880                                 '%3A' => ':',
 881                                 '%' => '.'
 882                         );
 883
 884                         $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
 885                         $id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
 886
 887                         if ( !preg_match( '/^[a-zA-Z]/', $id )
 888                         && !in_array( 'noninitial', $options ) )  {
 889                                 // Initial character must be a letter!
 890                                 $id = "x$id";
 891                         }
 892                         return $id;
 893                 }
 894
 895                 # XML-style escaping.  For the patterns used, see the XML 1.0 standard,
 896                 # 5th edition, NameStartChar and NameChar: <http://www.w3.org/TR/REC-xml/>
 897                 $nameStartChar = ':a-zA-Z_\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}'
 898                         . '\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}'
 899                         . '\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}';
 900                 $nameChar = $nameStartChar . '.\-0-9\xB7\x{0300}-\x{036F}'
 901                         . '\x{203F}-\x{2040}';
 902                 # Replace _ as well so we don't get multiple consecutive underscores
 903                 $id = preg_replace( "/([^$nameChar]|_)+/u", '_', $id );
 904                 $id = trim( $id, '_' );
 905
 906                 if ( !preg_match( "/^[$nameStartChar]/u", $id )
 907                 && !in_array( 'noninitial', $options ) ) {
 908                         $id = "_$id";
 909                 }
 910
 911                 return $id;
 912         }
 913
 914         /**
 915          * Given a value, escape it so that it can be used as a CSS class and
 916          * return it.
 917          *
 918          * @todo For extra validity, input should be validated UTF-8.
 919          *
 920          * @see http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
 921          *
 922          * @param $class String
 923          * @return String
 924          */
 925         static function escapeClass( $class ) {
 926                 // Convert ugly stuff to underscores and kill underscores in ugly places
 927                 return rtrim(preg_replace(
 928                         array('/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/','/_+/'),
 929                         '_',
 930                         $class ), '_');
 931         }
 932
 933         /**
 934          * Given HTML input, escape with htmlspecialchars but un-escape entites.
 935          * This allows (generally harmless) entities like &nbsp; to survive.
 936          *
 937          * @param $html String to escape
 938          * @return String: escaped input
 939          */
 940         static function escapeHtmlAllowEntities( $html ) {
 941                 # It seems wise to escape ' as well as ", as a matter of course.  Can't
 942                 # hurt.
 943                 $html = htmlspecialchars( $html, ENT_QUOTES );
 944                 $html = str_replace( '&amp;', '&', $html );
 945                 $html = Sanitizer::normalizeCharReferences( $html );
 946                 return $html;
 947         }
 948
 949         /**
 950          * Regex replace callback for armoring links against further processing.
 951          * @param $matches Array
 952          * @return string
 953          */
 954         private static function armorLinksCallback( $matches ) {
 955                 return str_replace( ':', '&#58;', $matches[1] );
 956         }
 957
 958         /**
 959          * Return an associative array of attribute names and values from
 960          * a partial tag string. Attribute names are forces to lowercase,
 961          * character references are decoded to UTF-8 text.
 962          *
 963          * @param $text String
 964          * @return Array
 965          */
 966         public static function decodeTagAttributes( $text ) {
 967                 if( trim( $text ) == '' ) {
 968                         return array();
 969                 }
 970
 971                 $attribs = array();
 972                 $pairs = array();
 973                 if( !preg_match_all(
 974                         MW_ATTRIBS_REGEX,
 975                         $text,
 976                         $pairs,
 977                         PREG_SET_ORDER ) ) {
 978                         return $attribs;
 979                 }
 980
 981                 foreach( $pairs as $set ) {
 982                         $attribute = strtolower( $set[1] );
 983                         $value = Sanitizer::getTagAttributeCallback( $set );
 984
 985                         // Normalize whitespace
 986                         $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
 987                         $value = trim( $value );
 988
 989                         // Decode character references
 990                         $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
 991                 }
 992                 return $attribs;
 993         }
 994
 995         /**
 996          * Pick the appropriate attribute value from a match set from the
 997          * MW_ATTRIBS_REGEX matches.
 998          *
 999          * @param $set Array
1000          * @return String
1001          */
1002         private static function getTagAttributeCallback( $set ) {
1003                 if( isset( $set[6] ) ) {
1004                         # Illegal #XXXXXX color with no quotes.
1005                         return $set[6];
1006                 } elseif( isset( $set[5] ) ) {
1007                         # No quotes.
1008                         return $set[5];
1009                 } elseif( isset( $set[4] ) ) {
1010                         # Single-quoted
1011                         return $set[4];
1012                 } elseif( isset( $set[3] ) ) {
1013                         # Double-quoted
1014                         return $set[3];
1015                 } elseif( !isset( $set[2] ) ) {
1016                         # In XHTML, attributes must have a value.
1017                         # For 'reduced' form, return explicitly the attribute name here.
1018                         return $set[1];
1019                 } else {
1020                         throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
1021                 }
1022         }
1023
1024         /**
1025          * Normalize whitespace and character references in an XML source-
1026          * encoded text for an attribute value.
1027          *
1028          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
1029          * but note that we're not returning the value, but are returning
1030          * XML source fragments that will be slapped into output.
1031          *
1032          * @param $text String
1033          * @return String
1034          */
1035         private static function normalizeAttributeValue( $text ) {
1036                 return str_replace( '"', '&quot;',
1037                         self::normalizeWhitespace(
1038                                 Sanitizer::normalizeCharReferences( $text ) ) );
1039         }
1040
1041         private static function normalizeWhitespace( $text ) {
1042                 return preg_replace(
1043                         '/\r\n|[\x20\x0d\x0a\x09]/',
1044                         ' ',
1045                         $text );
1046         }
1047
1048         /**
1049          * Ensure that any entities and character references are legal
1050          * for XML and XHTML specifically. Any stray bits will be
1051          * &amp;-escaped to result in a valid text fragment.
1052          *
1053          * a. any named char refs must be known in XHTML
1054          * b. any numeric char refs must be legal chars, not invalid or forbidden
1055          * c. use &#x, not &#X
1056          * d. fix or reject non-valid attributes
1057          *
1058          * @param $text String
1059          * @return String
1060          * @private
1061          */
1062         static function normalizeCharReferences( $text ) {
1063                 return preg_replace_callback(
1064                         MW_CHAR_REFS_REGEX,
1065                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
1066                         $text );
1067         }
1068         /**
1069          * @param $matches String
1070          * @return String
1071          */
1072         static function normalizeCharReferencesCallback( $matches ) {
1073                 $ret = null;
1074                 if( $matches[1] != '' ) {
1075                         $ret = Sanitizer::normalizeEntity( $matches[1] );
1076                 } elseif( $matches[2] != '' ) {
1077                         $ret = Sanitizer::decCharReference( $matches[2] );
1078                 } elseif( $matches[3] != ''  ) {
1079                         $ret = Sanitizer::hexCharReference( $matches[3] );
1080                 } elseif( $matches[4] != '' ) {
1081                         $ret = Sanitizer::hexCharReference( $matches[4] );
1082                 }
1083                 if( is_null( $ret ) ) {
1084                         return htmlspecialchars( $matches[0] );
1085                 } else {
1086                         return $ret;
1087                 }
1088         }
1089
1090         /**
1091          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
1092          * return the named entity reference as is. If the entity is a
1093          * MediaWiki-specific alias, returns the HTML equivalent. Otherwise,
1094          * returns HTML-escaped text of pseudo-entity source (eg &amp;foo;)
1095          *
1096          * @param $name String
1097          * @return String
1098          */
1099         static function normalizeEntity( $name ) {
1100                 global $wgHtmlEntities, $wgHtmlEntityAliases;
1101                 if ( isset( $wgHtmlEntityAliases[$name] ) ) {
1102                         return "&{$wgHtmlEntityAliases[$name]};";
1103                 } elseif( isset( $wgHtmlEntities[$name] ) ) {
1104                         return "&$name;";
1105                 } else {
1106                         return "&amp;$name;";
1107                 }
1108         }
1109
1110         static function decCharReference( $codepoint ) {
1111                 $point = intval( $codepoint );
1112                 if( Sanitizer::validateCodepoint( $point ) ) {
1113                         return sprintf( '&#%d;', $point );
1114                 } else {
1115                         return null;
1116                 }
1117         }
1118
1119         static function hexCharReference( $codepoint ) {
1120                 $point = hexdec( $codepoint );
1121                 if( Sanitizer::validateCodepoint( $point ) ) {
1122                         return sprintf( '&#x%x;', $point );
1123                 } else {
1124                         return null;
1125                 }
1126         }
1127
1128         /**
1129          * Returns true if a given Unicode codepoint is a valid character in XML.
1130          * @param $codepoint Integer
1131          * @return Boolean
1132          */
1133         private static function validateCodepoint( $codepoint ) {
1134                 return ($codepoint ==    0x09)
1135                         || ($codepoint ==    0x0a)
1136                         || ($codepoint ==    0x0d)
1137                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
1138                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
1139                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
1140         }
1141
1142         /**
1143          * Decode any character references, numeric or named entities,
1144          * in the text and return a UTF-8 string.
1145          *
1146          * @param $text String
1147          * @return String
1148          */
1149         public static function decodeCharReferences( $text ) {
1150                 return preg_replace_callback(
1151                         MW_CHAR_REFS_REGEX,
1152                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
1153                         $text );
1154         }
1155
1156         /**
1157          * @param $matches String
1158          * @return String
1159          */
1160         static function decodeCharReferencesCallback( $matches ) {
1161                 if( $matches[1] != '' ) {
1162                         return Sanitizer::decodeEntity( $matches[1] );
1163                 } elseif( $matches[2] != '' ) {
1164                         return  Sanitizer::decodeChar( intval( $matches[2] ) );
1165                 } elseif( $matches[3] != ''  ) {
1166                         return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
1167                 } elseif( $matches[4] != '' ) {
1168                         return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
1169                 }
1170                 # Last case should be an ampersand by itself
1171                 return $matches[0];
1172         }
1173
1174         /**
1175          * Return UTF-8 string for a codepoint if that is a valid
1176          * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
1177          * @param $codepoint Integer
1178          * @return String
1179          * @private
1180          */
1181         static function decodeChar( $codepoint ) {
1182                 if( Sanitizer::validateCodepoint( $codepoint ) ) {
1183                         return codepointToUtf8( $codepoint );
1184                 } else {
1185                         return UTF8_REPLACEMENT;
1186                 }
1187         }
1188
1189         /**
1190          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
1191          * return the UTF-8 encoding of that character. Otherwise, returns
1192          * pseudo-entity source (eg &foo;)
1193          *
1194          * @param $name Strings
1195          * @return String
1196          */
1197         static function decodeEntity( $name ) {
1198                 global $wgHtmlEntities, $wgHtmlEntityAliases;
1199                 if ( isset( $wgHtmlEntityAliases[$name] ) ) {
1200                         $name = $wgHtmlEntityAliases[$name];
1201                 }
1202                 if( isset( $wgHtmlEntities[$name] ) ) {
1203                         return codepointToUtf8( $wgHtmlEntities[$name] );
1204                 } else {
1205                         return "&$name;";
1206                 }
1207         }
1208
1209         /**
1210          * Fetch the whitelist of acceptable attributes for a given element name.
1211          *
1212          * @param $element String
1213          * @return Array
1214          */
1215         static function attributeWhitelist( $element ) {
1216                 static $list;
1217                 if( !isset( $list ) ) {
1218                         $list = Sanitizer::setupAttributeWhitelist();
1219                 }
1220                 return isset( $list[$element] )
1221                         ? $list[$element]
1222                         : array();
1223         }
1224
1225         /**
1226          * Foreach array key (an allowed HTML element), return an array
1227          * of allowed attributes
1228          * @return Array
1229          */
1230         static function setupAttributeWhitelist() {
1231                 global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowMicrodataAttributes;
1232
1233                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style', 'xml:lang' );
1234
1235                 if ( $wgAllowRdfaAttributes ) {
1236                         #RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
1237                         $common = array_merge( $common, array(
1238                             'about', 'property', 'resource', 'datatype', 'typeof',
1239                         ) );
1240                 }
1241
1242                 if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
1243                         # add HTML5 microdata tages as pecified by http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#the-microdata-model
1244                         $common = array_merge( $common, array(
1245                             'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
1246                         ) );
1247                 }
1248
1249                 $block = array_merge( $common, array( 'align' ) );
1250                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
1251                 $tablecell = array( 'abbr',
1252                                     'axis',
1253                                     'headers',
1254                                     'scope',
1255                                     'rowspan',
1256                                     'colspan',
1257                                     'nowrap', # deprecated
1258                                     'width',  # deprecated
1259                                     'height', # deprecated
1260                                     'bgcolor' # deprecated
1261                                     );
1262
1263                 # Numbers refer to sections in HTML 4.01 standard describing the element.
1264                 # See: http://www.w3.org/TR/html4/
1265                 $whitelist = array (
1266                         # 7.5.4
1267                         'div'        => $block,
1268                         'center'     => $common, # deprecated
1269                         'span'       => $block, # ??
1270
1271                         # 7.5.5
1272                         'h1'         => $block,
1273                         'h2'         => $block,
1274                         'h3'         => $block,
1275                         'h4'         => $block,
1276                         'h5'         => $block,
1277                         'h6'         => $block,
1278
1279                         # 7.5.6
1280                         # address
1281
1282                         # 8.2.4
1283                         # bdo
1284
1285                         # 9.2.1
1286                         'em'         => $common,
1287                         'strong'     => $common,
1288                         'cite'       => $common,
1289                         # dfn
1290                         'code'       => $common,
1291                         # samp
1292                         # kbd
1293                         'var'        => $common,
1294                         'abbr'       => $common,
1295                         # acronym
1296
1297                         # 9.2.2
1298                         'blockquote' => array_merge( $common, array( 'cite' ) ),
1299                         # q
1300
1301                         # 9.2.3
1302                         'sub'        => $common,
1303                         'sup'        => $common,
1304
1305                         # 9.3.1
1306                         'p'          => $block,
1307
1308                         # 9.3.2
1309                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
1310
1311                         # 9.3.4
1312                         'pre'        => array_merge( $common, array( 'width' ) ),
1313
1314                         # 9.4
1315                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
1316                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
1317
1318                         # 10.2
1319                         'ul'         => array_merge( $common, array( 'type' ) ),
1320                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
1321                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
1322
1323                         # 10.3
1324                         'dl'         => $common,
1325                         'dd'         => $common,
1326                         'dt'         => $common,
1327
1328                         # 11.2.1
1329                         'table'      => array_merge( $common,
1330                                                                 array( 'summary', 'width', 'border', 'frame',
1331                                                                                 'rules', 'cellspacing', 'cellpadding',
1332                                                                                 'align', 'bgcolor',
1333                                                                 ) ),
1334
1335                         # 11.2.2
1336                         'caption'    => array_merge( $common, array( 'align' ) ),
1337
1338                         # 11.2.3
1339                         'thead'      => array_merge( $common, $tablealign ),
1340                         'tfoot'      => array_merge( $common, $tablealign ),
1341                         'tbody'      => array_merge( $common, $tablealign ),
1342
1343                         # 11.2.4
1344                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1345                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1346
1347                         # 11.2.5
1348                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
1349
1350                         # 11.2.6
1351                         'td'         => array_merge( $common, $tablecell, $tablealign ),
1352                         'th'         => array_merge( $common, $tablecell, $tablealign ),
1353
1354                         # 12.2 # NOTE: <a> is not allowed directly, but the attrib whitelist is used from the Parser object
1355                         'a'          => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa
1356
1357                         # 13.2
1358                         # Not usually allowed, but may be used for extension-style hooks
1359                         # such as <math> when it is rasterized
1360                         'img'        => array_merge( $common, array( 'alt' ) ),
1361
1362                         # 15.2.1
1363                         'tt'         => $common,
1364                         'b'          => $common,
1365                         'i'          => $common,
1366                         'big'        => $common,
1367                         'small'      => $common,
1368                         'strike'     => $common,
1369                         's'          => $common,
1370                         'u'          => $common,
1371
1372                         # 15.2.2
1373                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
1374                         # basefont
1375
1376                         # 15.3
1377                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
1378
1379                         # XHTML Ruby annotation text module, simple ruby only.
1380                         # http://www.w3c.org/TR/ruby/
1381                         'ruby'       => $common,
1382                         # rbc
1383                         # rtc
1384                         'rb'         => $common,
1385                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
1386                         'rp'         => $common,
1387
1388                         # MathML root element, where used for extensions
1389                         # 'title' may not be 100% valid here; it's XHTML
1390                         # http://www.w3.org/TR/REC-MathML/
1391                         'math'       => array( 'class', 'style', 'id', 'title' ),
1392                         );
1393                 return $whitelist;
1394         }
1395
1396         /**
1397          * Take a fragment of (potentially invalid) HTML and return
1398          * a version with any tags removed, encoded as plain text.
1399          *
1400          * Warning: this return value must be further escaped for literal
1401          * inclusion in HTML output as of 1.10!
1402          *
1403          * @param $text String: HTML fragment
1404          * @return String
1405          */
1406         static function stripAllTags( $text ) {
1407                 # Actual <tags>
1408                 $text = StringUtils::delimiterReplace( '<', '>', '', $text );
1409
1410                 # Normalize &entities and whitespace
1411                 $text = self::decodeCharReferences( $text );
1412                 $text = self::normalizeWhitespace( $text );
1413
1414                 return $text;
1415         }
1416
1417         /**
1418          * Hack up a private DOCTYPE with HTML's standard entity declarations.
1419          * PHP 4 seemed to know these if you gave it an HTML doctype, but
1420          * PHP 5.1 doesn't.
1421          *
1422          * Use for passing XHTML fragments to PHP's XML parsing functions
1423          *
1424          * @return String
1425          */
1426         static function hackDocType() {
1427                 global $wgHtmlEntities;
1428                 $out = "<!DOCTYPE html [\n";
1429                 foreach( $wgHtmlEntities as $entity => $codepoint ) {
1430                         $out .= "<!ENTITY $entity \"&#$codepoint;\">";
1431                 }
1432                 $out .= "]>\n";
1433                 return $out;
1434         }
1435
1436         static function cleanUrl( $url ) {
1437                 # Normalize any HTML entities in input. They will be
1438                 # re-escaped by makeExternalLink().
1439                 $url = Sanitizer::decodeCharReferences( $url );
1440
1441                 # Escape any control characters introduced by the above step
1442                 $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
1443
1444                 # Validate hostname portion
1445                 $matches = array();
1446                 if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
1447                         list( /* $whole */, $protocol, $host, $rest ) = $matches;
1448
1449                         // Characters that will be ignored in IDNs.
1450                         // http://tools.ietf.org/html/3454#section-3.1
1451                         // Strip them before further processing so blacklists and such work.
1452                         $strip = "/
1453                                 \\s|          # general whitespace
1454                                 \xc2\xad|     # 00ad SOFT HYPHEN
1455                                 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
1456                                 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
1457                                 \xe2\x81\xa0| # 2060 WORD JOINER
1458                                 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
1459                                 \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
1460                                 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
1461                                 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
1462                                 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
1463                                 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
1464                                 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
1465                                 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
1466                                 /xuD";
1467
1468                         $host = preg_replace( $strip, '', $host );
1469
1470                         // @todo Fixme: validate hostnames here
1471
1472                         return $protocol . $host . $rest;
1473                 } else {
1474                         return $url;
1475                 }
1476         }
1477
1478 }