includes/Sanitizer.php

   1 <?php
   2 /**
   3  * XHTML sanitizer for MediaWiki
   4  *
   5  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * http://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @file
  24  * @ingroup Parser
  25  */
  26
  27 /**
  28  * Regular expression to match various types of character references in
  29  * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
  30  */
  31 define( 'MW_CHAR_REFS_REGEX',
  32         '/&([A-Za-z0-9\x80-\xff]+);
  33          |&\#([0-9]+);
  34          |&\#x([0-9A-Za-z]+);
  35          |&\#X([0-9A-Za-z]+);
  36          |(&)/x' );
  37
  38 /**
  39  * Regular expression to match HTML/XML attribute pairs within a tag.
  40  * Allows some... latitude.
  41  * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
  42  */
  43 $attrib = '[A-Za-z0-9]';
  44 $space = '[\x09\x0a\x0d\x20]';
  45 define( 'MW_ATTRIBS_REGEX',
  46         "/(?:^|$space)($attrib+)
  47           ($space*=$space*
  48                 (?:
  49                  # The attribute value: quoted or alone
  50                   \"([^<\"]*)\"
  51                  | '([^<']*)'
  52                  |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
  53                  |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
  54                                                          # colors are specified like this.
  55                                                          # We'll be normalizing it.
  56                 )
  57            )?(?=$space|\$)/sx" );
  58
  59 /**
  60  * List of all named character entities defined in HTML 4.01
  61  * http://www.w3.org/TR/html4/sgml/entities.html
  62  * @private
  63  */
  64 global $wgHtmlEntities;
  65 $wgHtmlEntities = array(
  66         'Aacute'   => 193,
  67         'aacute'   => 225,
  68         'Acirc'    => 194,
  69         'acirc'    => 226,
  70         'acute'    => 180,
  71         'AElig'    => 198,
  72         'aelig'    => 230,
  73         'Agrave'   => 192,
  74         'agrave'   => 224,
  75         'alefsym'  => 8501,
  76         'Alpha'    => 913,
  77         'alpha'    => 945,
  78         'amp'      => 38,
  79         'and'      => 8743,
  80         'ang'      => 8736,
  81         'Aring'    => 197,
  82         'aring'    => 229,
  83         'asymp'    => 8776,
  84         'Atilde'   => 195,
  85         'atilde'   => 227,
  86         'Auml'     => 196,
  87         'auml'     => 228,
  88         'bdquo'    => 8222,
  89         'Beta'     => 914,
  90         'beta'     => 946,
  91         'brvbar'   => 166,
  92         'bull'     => 8226,
  93         'cap'      => 8745,
  94         'Ccedil'   => 199,
  95         'ccedil'   => 231,
  96         'cedil'    => 184,
  97         'cent'     => 162,
  98         'Chi'      => 935,
  99         'chi'      => 967,
 100         'circ'     => 710,
 101         'clubs'    => 9827,
 102         'cong'     => 8773,
 103         'copy'     => 169,
 104         'crarr'    => 8629,
 105         'cup'      => 8746,
 106         'curren'   => 164,
 107         'dagger'   => 8224,
 108         'Dagger'   => 8225,
 109         'darr'     => 8595,
 110         'dArr'     => 8659,
 111         'deg'      => 176,
 112         'Delta'    => 916,
 113         'delta'    => 948,
 114         'diams'    => 9830,
 115         'divide'   => 247,
 116         'Eacute'   => 201,
 117         'eacute'   => 233,
 118         'Ecirc'    => 202,
 119         'ecirc'    => 234,
 120         'Egrave'   => 200,
 121         'egrave'   => 232,
 122         'empty'    => 8709,
 123         'emsp'     => 8195,
 124         'ensp'     => 8194,
 125         'Epsilon'  => 917,
 126         'epsilon'  => 949,
 127         'equiv'    => 8801,
 128         'Eta'      => 919,
 129         'eta'      => 951,
 130         'ETH'      => 208,
 131         'eth'      => 240,
 132         'Euml'     => 203,
 133         'euml'     => 235,
 134         'euro'     => 8364,
 135         'exist'    => 8707,
 136         'fnof'     => 402,
 137         'forall'   => 8704,
 138         'frac12'   => 189,
 139         'frac14'   => 188,
 140         'frac34'   => 190,
 141         'frasl'    => 8260,
 142         'Gamma'    => 915,
 143         'gamma'    => 947,
 144         'ge'       => 8805,
 145         'gt'       => 62,
 146         'harr'     => 8596,
 147         'hArr'     => 8660,
 148         'hearts'   => 9829,
 149         'hellip'   => 8230,
 150         'Iacute'   => 205,
 151         'iacute'   => 237,
 152         'Icirc'    => 206,
 153         'icirc'    => 238,
 154         'iexcl'    => 161,
 155         'Igrave'   => 204,
 156         'igrave'   => 236,
 157         'image'    => 8465,
 158         'infin'    => 8734,
 159         'int'      => 8747,
 160         'Iota'     => 921,
 161         'iota'     => 953,
 162         'iquest'   => 191,
 163         'isin'     => 8712,
 164         'Iuml'     => 207,
 165         'iuml'     => 239,
 166         'Kappa'    => 922,
 167         'kappa'    => 954,
 168         'Lambda'   => 923,
 169         'lambda'   => 955,
 170         'lang'     => 9001,
 171         'laquo'    => 171,
 172         'larr'     => 8592,
 173         'lArr'     => 8656,
 174         'lceil'    => 8968,
 175         'ldquo'    => 8220,
 176         'le'       => 8804,
 177         'lfloor'   => 8970,
 178         'lowast'   => 8727,
 179         'loz'      => 9674,
 180         'lrm'      => 8206,
 181         'lsaquo'   => 8249,
 182         'lsquo'    => 8216,
 183         'lt'       => 60,
 184         'macr'     => 175,
 185         'mdash'    => 8212,
 186         'micro'    => 181,
 187         'middot'   => 183,
 188         'minus'    => 8722,
 189         'Mu'       => 924,
 190         'mu'       => 956,
 191         'nabla'    => 8711,
 192         'nbsp'     => 160,
 193         'ndash'    => 8211,
 194         'ne'       => 8800,
 195         'ni'       => 8715,
 196         'not'      => 172,
 197         'notin'    => 8713,
 198         'nsub'     => 8836,
 199         'Ntilde'   => 209,
 200         'ntilde'   => 241,
 201         'Nu'       => 925,
 202         'nu'       => 957,
 203         'Oacute'   => 211,
 204         'oacute'   => 243,
 205         'Ocirc'    => 212,
 206         'ocirc'    => 244,
 207         'OElig'    => 338,
 208         'oelig'    => 339,
 209         'Ograve'   => 210,
 210         'ograve'   => 242,
 211         'oline'    => 8254,
 212         'Omega'    => 937,
 213         'omega'    => 969,
 214         'Omicron'  => 927,
 215         'omicron'  => 959,
 216         'oplus'    => 8853,
 217         'or'       => 8744,
 218         'ordf'     => 170,
 219         'ordm'     => 186,
 220         'Oslash'   => 216,
 221         'oslash'   => 248,
 222         'Otilde'   => 213,
 223         'otilde'   => 245,
 224         'otimes'   => 8855,
 225         'Ouml'     => 214,
 226         'ouml'     => 246,
 227         'para'     => 182,
 228         'part'     => 8706,
 229         'permil'   => 8240,
 230         'perp'     => 8869,
 231         'Phi'      => 934,
 232         'phi'      => 966,
 233         'Pi'       => 928,
 234         'pi'       => 960,
 235         'piv'      => 982,
 236         'plusmn'   => 177,
 237         'pound'    => 163,
 238         'prime'    => 8242,
 239         'Prime'    => 8243,
 240         'prod'     => 8719,
 241         'prop'     => 8733,
 242         'Psi'      => 936,
 243         'psi'      => 968,
 244         'quot'     => 34,
 245         'radic'    => 8730,
 246         'rang'     => 9002,
 247         'raquo'    => 187,
 248         'rarr'     => 8594,
 249         'rArr'     => 8658,
 250         'rceil'    => 8969,
 251         'rdquo'    => 8221,
 252         'real'     => 8476,
 253         'reg'      => 174,
 254         'rfloor'   => 8971,
 255         'Rho'      => 929,
 256         'rho'      => 961,
 257         'rlm'      => 8207,
 258         'rsaquo'   => 8250,
 259         'rsquo'    => 8217,
 260         'sbquo'    => 8218,
 261         'Scaron'   => 352,
 262         'scaron'   => 353,
 263         'sdot'     => 8901,
 264         'sect'     => 167,
 265         'shy'      => 173,
 266         'Sigma'    => 931,
 267         'sigma'    => 963,
 268         'sigmaf'   => 962,
 269         'sim'      => 8764,
 270         'spades'   => 9824,
 271         'sub'      => 8834,
 272         'sube'     => 8838,
 273         'sum'      => 8721,
 274         'sup'      => 8835,
 275         'sup1'     => 185,
 276         'sup2'     => 178,
 277         'sup3'     => 179,
 278         'supe'     => 8839,
 279         'szlig'    => 223,
 280         'Tau'      => 932,
 281         'tau'      => 964,
 282         'there4'   => 8756,
 283         'Theta'    => 920,
 284         'theta'    => 952,
 285         'thetasym' => 977,
 286         'thinsp'   => 8201,
 287         'THORN'    => 222,
 288         'thorn'    => 254,
 289         'tilde'    => 732,
 290         'times'    => 215,
 291         'trade'    => 8482,
 292         'Uacute'   => 218,
 293         'uacute'   => 250,
 294         'uarr'     => 8593,
 295         'uArr'     => 8657,
 296         'Ucirc'    => 219,
 297         'ucirc'    => 251,
 298         'Ugrave'   => 217,
 299         'ugrave'   => 249,
 300         'uml'      => 168,
 301         'upsih'    => 978,
 302         'Upsilon'  => 933,
 303         'upsilon'  => 965,
 304         'Uuml'     => 220,
 305         'uuml'     => 252,
 306         'weierp'   => 8472,
 307         'Xi'       => 926,
 308         'xi'       => 958,
 309         'Yacute'   => 221,
 310         'yacute'   => 253,
 311         'yen'      => 165,
 312         'Yuml'     => 376,
 313         'yuml'     => 255,
 314         'Zeta'     => 918,
 315         'zeta'     => 950,
 316         'zwj'      => 8205,
 317         'zwnj'     => 8204 );
 318
 319 /**
 320  * Character entity aliases accepted by MediaWiki
 321  */
 322 global $wgHtmlEntityAliases;
 323 $wgHtmlEntityAliases = array(
 324         'רלמ' => 'rlm',
 325         'رلم' => 'rlm',
 326 );
 327
 328
 329 /**
 330  * XHTML sanitizer for MediaWiki
 331  * @ingroup Parser
 332  */
 333 class Sanitizer {
 334         /**
 335          * Cleans up HTML, removes dangerous tags and attributes, and
 336          * removes HTML comments
 337          * @private
 338          * @param $text String
 339          * @param $processCallback Callback to do any variable or parameter replacements in HTML attribute values
 340          * @param $args Array for the processing callback
 341          * @param $extratags Array for any extra tags to include
 342          * @param $removetags Array for any tags (default or extra) to exclude
 343          * @return string
 344          */
 345         static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
 346                 global $wgUseTidy;
 347
 348                 static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
 349                         $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
 350
 351                 wfProfileIn( __METHOD__ );
 352
 353                 if ( !$staticInitialised ) {
 354
 355                         $htmlpairsStatic = array( # Tags that must be closed
 356                                 'a', 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 357                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 358                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
 359                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 360                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u', 'abbr'
 361                         );
 362                         $htmlsingle = array(
 363                                 'br', 'hr', 'li', 'dt', 'dd'
 364                         );
 365                         $htmlsingleonly = array( # Elements that cannot have close tags
 366                                 'br', 'hr'
 367                         );
 368                         $htmlnest = array( # Tags that can be nested--??
 369                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 370                                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
 371                         );
 372                         $tabletags = array( # Can only appear inside table, we will close them
 373                                 'td', 'th', 'tr',
 374                         );
 375                         $htmllist = array( # Tags used by list
 376                                 'ul','ol',
 377                         );
 378                         $listtags = array( # Tags that can appear in a list
 379                                 'li',
 380                         );
 381
 382                         $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
 383                         $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );
 384
 385                         # Convert them all to hashtables for faster lookup
 386                         $vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
 387                                 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
 388                         foreach ( $vars as $var ) {
 389                                 $$var = array_flip( $$var );
 390                         }
 391                         $staticInitialised = true;
 392                 }
 393                 # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
 394                 $extratags = array_flip( $extratags );
 395                 $removetags = array_flip( $removetags );
 396                 $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
 397                 $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ) , $removetags );
 398
 399                 # Remove HTML comments
 400                 $text = Sanitizer::removeHTMLcomments( $text );
 401                 $bits = explode( '<', $text );
 402                 $text = str_replace( '>', '&gt;', array_shift( $bits ) );
 403                 if(!$wgUseTidy) {
 404                         $tagstack = $tablestack = array();
 405                         foreach ( $bits as $x ) {
 406                                 $regs = array();
 407                                 if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
 408                                         list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
 409                                 } else {
 410                                         $slash = $t = $params = $brace = $rest = null;
 411                                 }
 412
 413                                 $badtag = 0 ;
 414                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
 415                                         # Check our stack
 416                                         if ( $slash ) {
 417                                                 # Closing a tag...
 418                                                 if( isset( $htmlsingleonly[$t] ) ) {
 419                                                         $badtag = 1;
 420                                                 } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
 421                                                         if ( isset( $htmlsingleallowed[$ot] ) ) {
 422                                                                 # Pop all elements with an optional close tag
 423                                                                 # and see if we find a match below them
 424                                                                 $optstack = array();
 425                                                                 array_push ($optstack, $ot);
 426                                                                 while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
 427                                                                                 isset( $htmlsingleallowed[$ot] ) )
 428                                                                 {
 429                                                                         array_push ($optstack, $ot);
 430                                                                 }
 431                                                                 if ( $t != $ot ) {
 432                                                                         # No match. Push the optinal elements back again
 433                                                                         $badtag = 1;
 434                                                                         while ( $ot = @array_pop( $optstack ) ) {
 435                                                                                 array_push( $tagstack, $ot );
 436                                                                         }
 437                                                                 }
 438                                                         } else {
 439                                                                 @array_push( $tagstack, $ot );
 440                                                                 # <li> can be nested in <ul> or <ol>, skip those cases:
 441                                                                 if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) {
 442                                                                         $badtag = 1;
 443                                                                 }
 444                                                         }
 445                                                 } else {
 446                                                         if ( $t == 'table' ) {
 447                                                                 $tagstack = array_pop( $tablestack );
 448                                                         }
 449                                                 }
 450                                                 $newparams = '';
 451                                         } else {
 452                                                 # Keep track for later
 453                                                 if ( isset( $tabletags[$t] ) &&
 454                                                 ! in_array( 'table', $tagstack ) ) {
 455                                                         $badtag = 1;
 456                                                 } else if ( in_array( $t, $tagstack ) &&
 457                                                 ! isset( $htmlnest [$t ] ) ) {
 458                                                         $badtag = 1 ;
 459                                                 # Is it a self closed htmlpair ? (bug 5487)
 460                                                 } else if( $brace == '/>' &&
 461                                                 isset( $htmlpairs[$t] ) ) {
 462                                                         $badtag = 1;
 463                                                 } elseif( isset( $htmlsingleonly[$t] ) ) {
 464                                                         # Hack to force empty tag for uncloseable elements
 465                                                         $brace = '/>';
 466                                                 } else if( isset( $htmlsingle[$t] ) ) {
 467                                                         # Hack to not close $htmlsingle tags
 468                                                         $brace = NULL;
 469                                                 } else if( isset( $tabletags[$t] )
 470                                                 &&  in_array($t ,$tagstack) ) {
 471                                                         // New table tag but forgot to close the previous one
 472                                                         $text .= "</$t>";
 473                                                 } else {
 474                                                         if ( $t == 'table' ) {
 475                                                                 array_push( $tablestack, $tagstack );
 476                                                                 $tagstack = array();
 477                                                         }
 478                                                         array_push( $tagstack, $t );
 479                                                 }
 480
 481                                                 # Replace any variables or template parameters with
 482                                                 # plaintext results.
 483                                                 if( is_callable( $processCallback ) ) {
 484                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
 485                                                 }
 486
 487                                                 # Strip non-approved attributes from the tag
 488                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 489                                         }
 490                                         if ( ! $badtag ) {
 491                                                 $rest = str_replace( '>', '&gt;', $rest );
 492                                                 $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
 493                                                 $text .= "<$slash$t$newparams$close>$rest";
 494                                                 continue;
 495                                         }
 496                                 }
 497                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 498                         }
 499                         # Close off any remaining tags
 500                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 501                                 $text .= "</$t>\n";
 502                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 503                         }
 504                 } else {
 505                         # this might be possible using tidy itself
 506                         foreach ( $bits as $x ) {
 507                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 508                                 $x, $regs );
 509                                 @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
 510                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
 511                                         if( is_callable( $processCallback ) ) {
 512                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
 513                                         }
 514                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 515                                         $rest = str_replace( '>', '&gt;', $rest );
 516                                         $text .= "<$slash$t$newparams$brace$rest";
 517                                 } else {
 518                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 519                                 }
 520                         }
 521                 }
 522                 wfProfileOut( __METHOD__ );
 523                 return $text;
 524         }
 525
 526         /**
 527          * Remove '<!--', '-->', and everything between.
 528          * To avoid leaving blank lines, when a comment is both preceded
 529          * and followed by a newline (ignoring spaces), trim leading and
 530          * trailing spaces and one of the newlines.
 531          *
 532          * @private
 533          * @param $text String
 534          * @return string
 535          */
 536         static function removeHTMLcomments( $text ) {
 537                 wfProfileIn( __METHOD__ );
 538                 while (($start = strpos($text, '<!--')) !== false) {
 539                         $end = strpos($text, '-->', $start + 4);
 540                         if ($end === false) {
 541                                 # Unterminated comment; bail out
 542                                 break;
 543                         }
 544
 545                         $end += 3;
 546
 547                         # Trim space and newline if the comment is both
 548                         # preceded and followed by a newline
 549                         $spaceStart = max($start - 1, 0);
 550                         $spaceLen = $end - $spaceStart;
 551                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 552                                 $spaceStart--;
 553                                 $spaceLen++;
 554                         }
 555                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 556                                 $spaceLen++;
 557                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 558                                 # Remove the comment, leading and trailing
 559                                 # spaces, and leave only one newline.
 560                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 561                         }
 562                         else {
 563                                 # Remove just the comment.
 564                                 $text = substr_replace($text, '', $start, $end - $start);
 565                         }
 566                 }
 567                 wfProfileOut( __METHOD__ );
 568                 return $text;
 569         }
 570
 571         /**
 572          * Take an array of attribute names and values and normalize or discard
 573          * illegal values for the given element type.
 574          *
 575          * - Discards attributes not on a whitelist for the given element
 576          * - Unsafe style attributes are discarded
 577          * - Invalid id attributes are reencoded
 578          *
 579          * @param $attribs Array
 580          * @param $element String
 581          * @return Array
 582          *
 583          * @todo Check for legal values where the DTD limits things.
 584          * @todo Check for unique id attribute :P
 585          */
 586         static function validateTagAttributes( $attribs, $element ) {
 587                 return Sanitizer::validateAttributes( $attribs,
 588                         Sanitizer::attributeWhitelist( $element ) );
 589         }
 590
 591         /**
 592          * Take an array of attribute names and values and normalize or discard
 593          * illegal values for the given whitelist.
 594          *
 595          * - Discards attributes not on the given whitelist
 596          * - Unsafe style attributes are discarded
 597          * - Invalid id attributes are reencoded
 598          *
 599          * @param $attribs Array
 600          * @param $whitelist Array: list of allowed attribute names
 601          * @return Array
 602          *
 603          * @todo Check for legal values where the DTD limits things.
 604          * @todo Check for unique id attribute :P
 605          */
 606         static function validateAttributes( $attribs, $whitelist ) {
 607                 $whitelist = array_flip( $whitelist );
 608                 $hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';
 609
 610                 $out = array();
 611                 foreach( $attribs as $attribute => $value ) {
 612                         if( !isset( $whitelist[$attribute] ) ) {
 613                                 continue;
 614                         }
 615                         # Strip javascript "expression" from stylesheets.
 616                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 617                         if( $attribute == 'style' ) {
 618                                 $value = Sanitizer::checkCss( $value );
 619                                 if( $value === false ) {
 620                                         # haxx0r
 621                                         continue;
 622                                 }
 623                         }
 624
 625                         if ( $attribute === 'id' ) {
 626                                 global $wgEnforceHtmlIds;
 627                                 $value = Sanitizer::escapeId( $value,
 628                                         $wgEnforceHtmlIds ? 'noninitial' : 'xml' );
 629                         }
 630
 631                         if ( $attribute === 'href' || $attribute === 'src' ) {
 632                                 if ( !preg_match( $hrefExp, $value ) ) {
 633                                         continue; //drop any href or src attributes not using an allowed protocol.
 634                                                   //NOTE: this also drops all relative URLs
 635                                 }
 636                         }
 637
 638                         //RDFa properties allow URIs. check them
 639                         if ( $attribute === 'rel' || $attribute === 'rev' ||
 640                                 $attribute === 'about' || $attribute === 'property' || $attribute === 'resource' ||
 641                                 $attribute === 'datatype' || $attribute === 'typeof' ) {
 642                                 //Paranoia. Allow "simple" values but suppress javascript
 643                                 if ( preg_match( '/(^|\s)javascript\s*:/i', $value ) ) {
 644                                         continue;
 645                                 }
 646                         }
 647
 648                         // If this attribute was previously set, override it.
 649                         // Output should only have one attribute of each name.
 650                         $out[$attribute] = $value;
 651                 }
 652                 return $out;
 653         }
 654
 655         /**
 656          * Merge two sets of HTML attributes.  Conflicting items in the second set
 657          * will override those in the first, except for 'class' attributes which
 658          * will be combined (if they're both strings).
 659          *
 660          * @todo implement merging for other attributes such as style
 661          * @param $a Array
 662          * @param $b Array
 663          * @return array
 664          */
 665         static function mergeAttributes( $a, $b ) {
 666                 $out = array_merge( $a, $b );
 667                 if( isset( $a['class'] ) && isset( $b['class'] )
 668                 && is_string( $a['class'] ) && is_string( $b['class'] )
 669                 && $a['class'] !== $b['class'] ) {
 670                         $classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}",
 671                                 -1, PREG_SPLIT_NO_EMPTY );
 672                         $out['class'] = implode( ' ', array_unique( $classes ) );
 673                 }
 674                 return $out;
 675         }
 676
 677         /**
 678          * Pick apart some CSS and check it for forbidden or unsafe structures.
 679          * Returns a sanitized string, or false if it was just too evil.
 680          *
 681          * Currently URL references, 'expression', 'tps' are forbidden.
 682          *
 683          * @param $value String
 684          * @return Mixed
 685          */
 686         static function checkCss( $value ) {
 687                 $stripped = Sanitizer::decodeCharReferences( $value );
 688
 689                 // Remove any comments; IE gets token splitting wrong
 690                 $stripped = StringUtils::delimiterReplace( '/*', '*/', ' ', $stripped );
 691
 692                 $value = $stripped;
 693
 694                 // ... and continue checks
 695                 $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
 696                         'codepointToUtf8(hexdec("$1"))', $stripped );
 697                 $stripped = str_replace( '\\', '', $stripped );
 698                 if( preg_match( '/(?:expression|tps*:\/\/|url\\s*\().*/is',
 699                                 $stripped ) ) {
 700                         # haxx0r
 701                         return false;
 702                 }
 703
 704                 return $value;
 705         }
 706
 707         /**
 708          * Take a tag soup fragment listing an HTML element's attributes
 709          * and normalize it to well-formed XML, discarding unwanted attributes.
 710          * Output is safe for further wikitext processing, with escaping of
 711          * values that could trigger problems.
 712          *
 713          * - Normalizes attribute names to lowercase
 714          * - Discards attributes not on a whitelist for the given element
 715          * - Turns broken or invalid entities into plaintext
 716          * - Double-quotes all attribute values
 717          * - Attributes without values are given the name as attribute
 718          * - Double attributes are discarded
 719          * - Unsafe style attributes are discarded
 720          * - Prepends space if there are attributes.
 721          *
 722          * @param $text String
 723          * @param $element String
 724          * @return String
 725          */
 726         static function fixTagAttributes( $text, $element ) {
 727                 if( trim( $text ) == '' ) {
 728                         return '';
 729                 }
 730
 731                 $stripped = Sanitizer::validateTagAttributes(
 732                         Sanitizer::decodeTagAttributes( $text ), $element );
 733
 734                 $attribs = array();
 735                 foreach( $stripped as $attribute => $value ) {
 736                         $encAttribute = htmlspecialchars( $attribute );
 737                         $encValue = Sanitizer::safeEncodeAttribute( $value );
 738
 739                         $attribs[] = "$encAttribute=\"$encValue\"";
 740                 }
 741                 return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
 742         }
 743
 744         /**
 745          * Encode an attribute value for HTML output.
 746          * @param $text String
 747          * @return HTML-encoded text fragment
 748          */
 749         static function encodeAttribute( $text ) {
 750                 $encValue = htmlspecialchars( $text, ENT_QUOTES );
 751
 752                 // Whitespace is normalized during attribute decoding,
 753                 // so if we've been passed non-spaces we must encode them
 754                 // ahead of time or they won't be preserved.
 755                 $encValue = strtr( $encValue, array(
 756                         "\n" => '&#10;',
 757                         "\r" => '&#13;',
 758                         "\t" => '&#9;',
 759                 ) );
 760
 761                 return $encValue;
 762         }
 763
 764         /**
 765          * Encode an attribute value for HTML tags, with extra armoring
 766          * against further wiki processing.
 767          * @param $text String
 768          * @return HTML-encoded text fragment
 769          */
 770         static function safeEncodeAttribute( $text ) {
 771                 $encValue = Sanitizer::encodeAttribute( $text );
 772
 773                 # Templates and links may be expanded in later parsing,
 774                 # creating invalid or dangerous output. Suppress this.
 775                 $encValue = strtr( $encValue, array(
 776                         '<'    => '&lt;',   // This should never happen,
 777                         '>'    => '&gt;',   // we've received invalid input
 778                         '"'    => '&quot;', // which should have been escaped.
 779                         '{'    => '&#123;',
 780                         '['    => '&#91;',
 781                         "''"   => '&#39;&#39;',
 782                         'ISBN' => '&#73;SBN',
 783                         'RFC'  => '&#82;FC',
 784                         'PMID' => '&#80;MID',
 785                         '|'    => '&#124;',
 786                         '__'   => '&#95;_',
 787                 ) );
 788
 789                 # Stupid hack
 790                 $encValue = preg_replace_callback(
 791                         '/(' . wfUrlProtocols() . ')/',
 792                         array( 'Sanitizer', 'armorLinksCallback' ),
 793                         $encValue );
 794                 return $encValue;
 795         }
 796
 797         /**
 798          * Given a value escape it so that it can be used in an id attribute and
 799          * return it, this does not validate the value however (see first link)
 800          *
 801          * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
 802          *                                                          in the id and
 803          *                                                          name attributes
 804          * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 805          *
 806          * @param $id String: id to validate
 807          * @param $options Mixed: string or array of strings (default is array()):
 808          *   'noninitial': This is a non-initial fragment of an id, not a full id,
 809          *       so don't pay attention if the first character isn't valid at the
 810          *       beginning of an id.
 811          *   'xml': Don't restrict the id to be HTML4-compatible.  This option
 812          *       allows any alphabetic character to be used, per the XML standard.
 813          *       Therefore, it also completely changes the type of escaping: instead
 814          *       of weird dot-encoding, runs of invalid characters (mostly
 815          *       whitespace) are just compressed into a single underscore.
 816          * @return String
 817          */
 818         static function escapeId( $id, $options = array() ) {
 819                 $options = (array)$options;
 820
 821                 if ( !in_array( 'xml', $options ) ) {
 822                         # HTML4-style escaping
 823                         static $replace = array(
 824                                 '%3A' => ':',
 825                                 '%' => '.'
 826                         );
 827
 828                         $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
 829                         $id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
 830
 831                         if ( !preg_match( '/^[a-zA-Z]/', $id )
 832                         && !in_array( 'noninitial', $options ) )  {
 833                                 // Initial character must be a letter!
 834                                 $id = "x$id";
 835                         }
 836                         return $id;
 837                 }
 838
 839                 # XML-style escaping.  For the patterns used, see the XML 1.0 standard,
 840                 # 5th edition, NameStartChar and NameChar: <http://www.w3.org/TR/REC-xml/>
 841                 $nameStartChar = ':a-zA-Z_\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}'
 842                         . '\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}'
 843                         . '\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}';
 844                 $nameChar = $nameStartChar . '.\-0-9\xB7\x{0300}-\x{036F}'
 845                         . '\x{203F}-\x{2040}';
 846                 # Replace _ as well so we don't get multiple consecutive underscores
 847                 $id = preg_replace( "/([^$nameChar]|_)+/u", '_', $id );
 848                 $id = trim( $id, '_' );
 849
 850                 if ( !preg_match( "/^[$nameStartChar]/u", $id )
 851                 && !in_array( 'noninitial', $options ) ) {
 852                         $id = "_$id";
 853                 }
 854
 855                 return $id;
 856         }
 857
 858         /**
 859          * Given a value, escape it so that it can be used as a CSS class and
 860          * return it.
 861          *
 862          * @todo For extra validity, input should be validated UTF-8.
 863          *
 864          * @see http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
 865          *
 866          * @param $class String
 867          * @return String
 868          */
 869         static function escapeClass( $class ) {
 870                 // Convert ugly stuff to underscores and kill underscores in ugly places
 871                 return rtrim(preg_replace(
 872                         array('/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/','/_+/'),
 873                         '_',
 874                         $class ), '_');
 875         }
 876
 877         /**
 878          * Given HTML input, escape with htmlspecialchars but un-escape entites.
 879          * This allows (generally harmless) entities like &nbsp; to survive.
 880          *
 881          * @param $html String to escape
 882          * @return String: escaped input
 883          */
 884         static function escapeHtmlAllowEntities( $html ) {
 885                 # It seems wise to escape ' as well as ", as a matter of course.  Can't
 886                 # hurt.
 887                 $html = htmlspecialchars( $html, ENT_QUOTES );
 888                 $html = str_replace( '&amp;', '&', $html );
 889                 $html = Sanitizer::normalizeCharReferences( $html );
 890                 return $html;
 891         }
 892
 893         /**
 894          * Regex replace callback for armoring links against further processing.
 895          * @param $matches Array
 896          * @return string
 897          */
 898         private static function armorLinksCallback( $matches ) {
 899                 return str_replace( ':', '&#58;', $matches[1] );
 900         }
 901
 902         /**
 903          * Return an associative array of attribute names and values from
 904          * a partial tag string. Attribute names are forces to lowercase,
 905          * character references are decoded to UTF-8 text.
 906          *
 907          * @param $text String
 908          * @return Array
 909          */
 910         public static function decodeTagAttributes( $text ) {
 911                 $attribs = array();
 912
 913                 if( trim( $text ) == '' ) {
 914                         return $attribs;
 915                 }
 916
 917                 $pairs = array();
 918                 if( !preg_match_all(
 919                         MW_ATTRIBS_REGEX,
 920                         $text,
 921                         $pairs,
 922                         PREG_SET_ORDER ) ) {
 923                         return $attribs;
 924                 }
 925
 926                 foreach( $pairs as $set ) {
 927                         $attribute = strtolower( $set[1] );
 928                         $value = Sanitizer::getTagAttributeCallback( $set );
 929
 930                         // Normalize whitespace
 931                         $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
 932                         $value = trim( $value );
 933
 934                         // Decode character references
 935                         $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
 936                 }
 937                 return $attribs;
 938         }
 939
 940         /**
 941          * Pick the appropriate attribute value from a match set from the
 942          * MW_ATTRIBS_REGEX matches.
 943          *
 944          * @param $set Array
 945          * @return String
 946          */
 947         private static function getTagAttributeCallback( $set ) {
 948                 if( isset( $set[6] ) ) {
 949                         # Illegal #XXXXXX color with no quotes.
 950                         return $set[6];
 951                 } elseif( isset( $set[5] ) ) {
 952                         # No quotes.
 953                         return $set[5];
 954                 } elseif( isset( $set[4] ) ) {
 955                         # Single-quoted
 956                         return $set[4];
 957                 } elseif( isset( $set[3] ) ) {
 958                         # Double-quoted
 959                         return $set[3];
 960                 } elseif( !isset( $set[2] ) ) {
 961                         # In XHTML, attributes must have a value.
 962                         # For 'reduced' form, return explicitly the attribute name here.
 963                         return $set[1];
 964                 } else {
 965                         throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
 966                 }
 967         }
 968
 969         /**
 970          * Normalize whitespace and character references in an XML source-
 971          * encoded text for an attribute value.
 972          *
 973          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 974          * but note that we're not returning the value, but are returning
 975          * XML source fragments that will be slapped into output.
 976          *
 977          * @param $text String
 978          * @return String
 979          */
 980         private static function normalizeAttributeValue( $text ) {
 981                 return str_replace( '"', '&quot;',
 982                         self::normalizeWhitespace(
 983                                 Sanitizer::normalizeCharReferences( $text ) ) );
 984         }
 985
 986         private static function normalizeWhitespace( $text ) {
 987                 return preg_replace(
 988                         '/\r\n|[\x20\x0d\x0a\x09]/',
 989                         ' ',
 990                         $text );
 991         }
 992
 993         /**
 994          * Ensure that any entities and character references are legal
 995          * for XML and XHTML specifically. Any stray bits will be
 996          * &amp;-escaped to result in a valid text fragment.
 997          *
 998          * a. any named char refs must be known in XHTML
 999          * b. any numeric char refs must be legal chars, not invalid or forbidden
1000          * c. use &#x, not &#X
1001          * d. fix or reject non-valid attributes
1002          *
1003          * @param $text String
1004          * @return String
1005          * @private
1006          */
1007         static function normalizeCharReferences( $text ) {
1008                 return preg_replace_callback(
1009                         MW_CHAR_REFS_REGEX,
1010                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
1011                         $text );
1012         }
1013         /**
1014          * @param $matches String
1015          * @return String
1016          */
1017         static function normalizeCharReferencesCallback( $matches ) {
1018                 $ret = null;
1019                 if( $matches[1] != '' ) {
1020                         $ret = Sanitizer::normalizeEntity( $matches[1] );
1021                 } elseif( $matches[2] != '' ) {
1022                         $ret = Sanitizer::decCharReference( $matches[2] );
1023                 } elseif( $matches[3] != ''  ) {
1024                         $ret = Sanitizer::hexCharReference( $matches[3] );
1025                 } elseif( $matches[4] != '' ) {
1026                         $ret = Sanitizer::hexCharReference( $matches[4] );
1027                 }
1028                 if( is_null( $ret ) ) {
1029                         return htmlspecialchars( $matches[0] );
1030                 } else {
1031                         return $ret;
1032                 }
1033         }
1034
1035         /**
1036          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
1037          * return the named entity reference as is. If the entity is a
1038          * MediaWiki-specific alias, returns the HTML equivalent. Otherwise,
1039          * returns HTML-escaped text of pseudo-entity source (eg &amp;foo;)
1040          *
1041          * @param $name String
1042          * @return String
1043          */
1044         static function normalizeEntity( $name ) {
1045                 global $wgHtmlEntities, $wgHtmlEntityAliases;
1046                 if ( isset( $wgHtmlEntityAliases[$name] ) ) {
1047                         return "&{$wgHtmlEntityAliases[$name]};";
1048                 } elseif( isset( $wgHtmlEntities[$name] ) ) {
1049                         return "&$name;";
1050                 } else {
1051                         return "&amp;$name;";
1052                 }
1053         }
1054
1055         static function decCharReference( $codepoint ) {
1056                 $point = intval( $codepoint );
1057                 if( Sanitizer::validateCodepoint( $point ) ) {
1058                         return sprintf( '&#%d;', $point );
1059                 } else {
1060                         return null;
1061                 }
1062         }
1063
1064         static function hexCharReference( $codepoint ) {
1065                 $point = hexdec( $codepoint );
1066                 if( Sanitizer::validateCodepoint( $point ) ) {
1067                         return sprintf( '&#x%x;', $point );
1068                 } else {
1069                         return null;
1070                 }
1071         }
1072
1073         /**
1074          * Returns true if a given Unicode codepoint is a valid character in XML.
1075          * @param $codepoint Integer
1076          * @return Boolean
1077          */
1078         private static function validateCodepoint( $codepoint ) {
1079                 return ($codepoint ==    0x09)
1080                         || ($codepoint ==    0x0a)
1081                         || ($codepoint ==    0x0d)
1082                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
1083                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
1084                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
1085         }
1086
1087         /**
1088          * Decode any character references, numeric or named entities,
1089          * in the text and return a UTF-8 string.
1090          *
1091          * @param $text String
1092          * @return String
1093          */
1094         public static function decodeCharReferences( $text ) {
1095                 return preg_replace_callback(
1096                         MW_CHAR_REFS_REGEX,
1097                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
1098                         $text );
1099         }
1100
1101         /**
1102          * @param $matches String
1103          * @return String
1104          */
1105         static function decodeCharReferencesCallback( $matches ) {
1106                 if( $matches[1] != '' ) {
1107                         return Sanitizer::decodeEntity( $matches[1] );
1108                 } elseif( $matches[2] != '' ) {
1109                         return  Sanitizer::decodeChar( intval( $matches[2] ) );
1110                 } elseif( $matches[3] != ''  ) {
1111                         return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
1112                 } elseif( $matches[4] != '' ) {
1113                         return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
1114                 }
1115                 # Last case should be an ampersand by itself
1116                 return $matches[0];
1117         }
1118
1119         /**
1120          * Return UTF-8 string for a codepoint if that is a valid
1121          * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
1122          * @param $codepoint Integer
1123          * @return String
1124          * @private
1125          */
1126         static function decodeChar( $codepoint ) {
1127                 if( Sanitizer::validateCodepoint( $codepoint ) ) {
1128                         return codepointToUtf8( $codepoint );
1129                 } else {
1130                         return UTF8_REPLACEMENT;
1131                 }
1132         }
1133
1134         /**
1135          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
1136          * return the UTF-8 encoding of that character. Otherwise, returns
1137          * pseudo-entity source (eg &foo;)
1138          *
1139          * @param $name Strings
1140          * @return String
1141          */
1142         static function decodeEntity( $name ) {
1143                 global $wgHtmlEntities, $wgHtmlEntityAliases;
1144                 if ( isset( $wgHtmlEntityAliases[$name] ) ) {
1145                         $name = $wgHtmlEntityAliases[$name];
1146                 }
1147                 if( isset( $wgHtmlEntities[$name] ) ) {
1148                         return codepointToUtf8( $wgHtmlEntities[$name] );
1149                 } else {
1150                         return "&$name;";
1151                 }
1152         }
1153
1154         /**
1155          * Fetch the whitelist of acceptable attributes for a given element name.
1156          *
1157          * @param $element String
1158          * @return Array
1159          */
1160         static function attributeWhitelist( $element ) {
1161                 static $list;
1162                 if( !isset( $list ) ) {
1163                         $list = Sanitizer::setupAttributeWhitelist();
1164                 }
1165                 return isset( $list[$element] )
1166                         ? $list[$element]
1167                         : array();
1168         }
1169
1170         /**
1171          * Foreach array key (an allowed HTML element), return an array
1172          * of allowed attributes
1173          * @return Array
1174          */
1175         static function setupAttributeWhitelist() {
1176                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style',
1177                                  #RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
1178                                  'about', 'property', 'resource', 'datatype', 'typeof',
1179                                 );
1180
1181                 $block = array_merge( $common, array( 'align' ) );
1182                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
1183                 $tablecell = array( 'abbr',
1184                                     'axis',
1185                                     'headers',
1186                                     'scope',
1187                                     'rowspan',
1188                                     'colspan',
1189                                     'nowrap', # deprecated
1190                                     'width',  # deprecated
1191                                     'height', # deprecated
1192                                     'bgcolor' # deprecated
1193                                     );
1194
1195                 # Numbers refer to sections in HTML 4.01 standard describing the element.
1196                 # See: http://www.w3.org/TR/html4/
1197                 $whitelist = array (
1198                         # 7.5.4
1199                         'div'        => $block,
1200                         'center'     => $common, # deprecated
1201                         'span'       => $block, # ??
1202
1203                         # 7.5.5
1204                         'h1'         => $block,
1205                         'h2'         => $block,
1206                         'h3'         => $block,
1207                         'h4'         => $block,
1208                         'h5'         => $block,
1209                         'h6'         => $block,
1210
1211                         # 7.5.6
1212                         # address
1213
1214                         # 8.2.4
1215                         # bdo
1216
1217                         # 9.2.1
1218                         'em'         => $common,
1219                         'strong'     => $common,
1220                         'cite'       => $common,
1221                         # dfn
1222                         'code'       => $common,
1223                         # samp
1224                         # kbd
1225                         'var'        => $common,
1226                         'abbr'       => $common,
1227                         # acronym
1228
1229                         # 9.2.2
1230                         'blockquote' => array_merge( $common, array( 'cite' ) ),
1231                         # q
1232
1233                         # 9.2.3
1234                         'sub'        => $common,
1235                         'sup'        => $common,
1236
1237                         # 9.3.1
1238                         'p'          => $block,
1239
1240                         # 9.3.2
1241                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
1242
1243                         # 9.3.4
1244                         'pre'        => array_merge( $common, array( 'width' ) ),
1245
1246                         # 9.4
1247                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
1248                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
1249
1250                         # 10.2
1251                         'ul'         => array_merge( $common, array( 'type' ) ),
1252                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
1253                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
1254
1255                         # 10.3
1256                         'dl'         => $common,
1257                         'dd'         => $common,
1258                         'dt'         => $common,
1259
1260                         # 11.2.1
1261                         'table'      => array_merge( $common,
1262                                                                 array( 'summary', 'width', 'border', 'frame',
1263                                                                                 'rules', 'cellspacing', 'cellpadding',
1264                                                                                 'align', 'bgcolor',
1265                                                                 ) ),
1266
1267                         # 11.2.2
1268                         'caption'    => array_merge( $common, array( 'align' ) ),
1269
1270                         # 11.2.3
1271                         'thead'      => array_merge( $common, $tablealign ),
1272                         'tfoot'      => array_merge( $common, $tablealign ),
1273                         'tbody'      => array_merge( $common, $tablealign ),
1274
1275                         # 11.2.4
1276                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1277                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1278
1279                         # 11.2.5
1280                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
1281
1282                         # 11.2.6
1283                         'td'         => array_merge( $common, $tablecell, $tablealign ),
1284                         'th'         => array_merge( $common, $tablecell, $tablealign ),
1285
1286                         # 12.2
1287                         'a'          => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa
1288
1289                         # 13.2
1290                         # Not usually allowed, but may be used for extension-style hooks
1291                         # such as <math> when it is rasterized
1292                         'img'        => array_merge( $common, array( 'alt' ) ),
1293
1294                         # 15.2.1
1295                         'tt'         => $common,
1296                         'b'          => $common,
1297                         'i'          => $common,
1298                         'big'        => $common,
1299                         'small'      => $common,
1300                         'strike'     => $common,
1301                         's'          => $common,
1302                         'u'          => $common,
1303
1304                         # 15.2.2
1305                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
1306                         # basefont
1307
1308                         # 15.3
1309                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
1310
1311                         # XHTML Ruby annotation text module, simple ruby only.
1312                         # http://www.w3c.org/TR/ruby/
1313                         'ruby'       => $common,
1314                         # rbc
1315                         # rtc
1316                         'rb'         => $common,
1317                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
1318                         'rp'         => $common,
1319
1320                         # MathML root element, where used for extensions
1321                         # 'title' may not be 100% valid here; it's XHTML
1322                         # http://www.w3.org/TR/REC-MathML/
1323                         'math'       => array( 'class', 'style', 'id', 'title' ),
1324                         );
1325                 return $whitelist;
1326         }
1327
1328         /**
1329          * Take a fragment of (potentially invalid) HTML and return
1330          * a version with any tags removed, encoded as plain text.
1331          *
1332          * Warning: this return value must be further escaped for literal
1333          * inclusion in HTML output as of 1.10!
1334          *
1335          * @param $text String: HTML fragment
1336          * @return String
1337          */
1338         static function stripAllTags( $text ) {
1339                 # Actual <tags>
1340                 $text = StringUtils::delimiterReplace( '<', '>', '', $text );
1341
1342                 # Normalize &entities and whitespace
1343                 $text = self::decodeCharReferences( $text );
1344                 $text = self::normalizeWhitespace( $text );
1345
1346                 return $text;
1347         }
1348
1349         /**
1350          * Hack up a private DOCTYPE with HTML's standard entity declarations.
1351          * PHP 4 seemed to know these if you gave it an HTML doctype, but
1352          * PHP 5.1 doesn't.
1353          *
1354          * Use for passing XHTML fragments to PHP's XML parsing functions
1355          *
1356          * @return String
1357          */
static function hackDocType() {
        global $wgHtmlEntities;

        # Build one <!ENTITY name "&#codepoint;"> declaration for every
        # entry of the global entity table, in table order.
        $declarations = array();
        foreach ( $wgHtmlEntities as $name => $number ) {
                $declarations[] = '<!ENTITY ' . $name . ' "&#' . $number . ';">';
        }

        # Wrap the declarations in an internal DTD subset. Only the
        # opening and closing lines end with a newline; the declarations
        # themselves are joined without any separator.
        return "<!DOCTYPE html [\n" . implode( '', $declarations ) . "]>\n";
}
1367
/**
 * Normalize and escape a URL.
 *
 * Character references in the input are decoded; the characters
 * [ ] < > " together with the bytes 0x00-0x20 and 0x7F are then passed
 * through urlencode(); finally, characters that are ignored in IDNs
 * are stripped from the host portion of the URL.
 *
 * @param $url String: URL, possibly containing character references
 * @return String
 */
static function cleanUrl( $url ) {
        # Normalize any HTML entities in input. They will be
        # re-escaped by makeExternalLink().
        $url = self::decodeCharReferences( $url );

        # Escape any control characters introduced by the above step.
        # A callback is used rather than the /e modifier: /e applied
        # addslashes() to the matched text before evaluating it, so '"'
        # and NUL came out with a stray encoded backslash (%5C), and the
        # modifier no longer exists as of PHP 7.
        $url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F]/',
                array( __CLASS__, 'cleanUrlCallback' ), $url );

        # Validate hostname portion
        $matches = array();
        if( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
                # No "protocol:" prefix to split on; return the escaped URL as is
                return $url;
        }

        # $host is the empty string when the URL has no "//host" part
        list( /* $whole */, $protocol, $host, $rest ) = $matches;

        // Characters that will be ignored in IDNs.
        // http://tools.ietf.org/html/rfc3454#section-3.1
        // Strip them before further processing so blacklists and such work.
        // (The final character class covers U+FE00 through U+FE0F.)
        $strip = "/
                \\s|          # general whitespace
                \xc2\xad|     # 00ad SOFT HYPHEN
                \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
                \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
                \xe2\x81\xa0| # 2060 WORD JOINER
                \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
                \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
                \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
                \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
                \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
                \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
                \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
                [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
                /xuD";

        # With the /u modifier preg_replace() returns null when the
        # subject is not valid UTF-8. Keep the host untouched in that
        # case instead of silently dropping it from the result.
        $stripped = preg_replace( $strip, '', $host );
        if( $stripped !== null ) {
                $host = $stripped;
        }

        // @fixme: validate hostnames here

        return $protocol . $host . $rest;
}

/**
 * Callback for cleanUrl(): URL-encode a single matched character.
 *
 * @param $matches Array: match array supplied by preg_replace_callback()
 * @return String
 */
private static function cleanUrlCallback( $matches ) {
        return urlencode( $matches[0] );
}
1409
1410 }