includes/Sanitizer.php

   1 <?php
   2 /**
   3  * XHTML sanitizer for MediaWiki
   4  *
   5  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * http://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @package MediaWiki
  24  * @subpackage Parser
  25  */
  26
  /**
   * Pattern recognising the character-reference forms handled by
   * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences.
   *
   * The pattern is an extended-mode (/x) alternation; each alternative owns
   * one capture group:
   *   1. named reference body,        e.g. "amp" in "&amp;"
   *   2. decimal reference digits,    e.g. "38"  in "&#38;"
   *   3. digits after a lower-case x, e.g. "26"  in "&#x26;"
   *   4. digits after an upper-case X, e.g. "26" in "&#X26;"
   *   5. a bare ampersand that matched none of the above
   */
  define( 'MW_CHAR_REFS_REGEX', implode( "\n         |", array(
      '/&([A-Za-z0-9]+);',
      '&\#([0-9]+);',
      '&\#x([0-9A-Za-z]+);',
      '&\#X([0-9A-Za-z]+);',
      '(&)/x',
  ) ) );
  37
  /**
   * Pattern matching one HTML/XML attribute (name plus optional value)
   * inside a tag. Deliberately lenient about what it accepts.
   * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes.
   *
   * Capture groups:
   *   1. attribute name
   *   2. the whole "= value" part, when present
   *   3. value given in double quotes (without the quotes)
   *   4. value given in single quotes (without the quotes)
   *   5. unquoted value
   *   6. unquoted "#hex" value
   *
   * $attrib and $space are sub-patterns interpolated into the expression.
   */
  $attrib = '[A-Za-z0-9]';
  $space = '[\x09\x0a\x0d\x20]';
  define( 'MW_ATTRIBS_REGEX',
      "/(?:^|{$space})({$attrib}+)
          ({$space}*={$space}*
                (?:
                 # The attribute value: quoted or alone
                  \"([^<\"]*)\"
                 | '([^<']*)'
                 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
                 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                                                         # colors are specified like this.
                                                         # We'll be normalizing it.
                )
           )?(?={$space}|\$)/sx" );
  58
  /**
   * Named character entities defined in HTML 4.01, mapped from entity name
   * to Unicode code point. Names are case-sensitive.
   * http://www.w3.org/TR/html4/sgml/entities.html
   * @private
   */
  global $wgHtmlEntities;
  $wgHtmlEntities = array(
      # A
      'Aacute' => 193, 'aacute' => 225, 'Acirc' => 194, 'acirc' => 226, 'acute' => 180,
      'AElig' => 198, 'aelig' => 230, 'Agrave' => 192, 'agrave' => 224, 'alefsym' => 8501,
      'Alpha' => 913, 'alpha' => 945, 'amp' => 38, 'and' => 8743, 'ang' => 8736,
      'Aring' => 197, 'aring' => 229, 'asymp' => 8776, 'Atilde' => 195, 'atilde' => 227,
      'Auml' => 196, 'auml' => 228,
      # B
      'bdquo' => 8222, 'Beta' => 914, 'beta' => 946, 'brvbar' => 166, 'bull' => 8226,
      # C
      'cap' => 8745, 'Ccedil' => 199, 'ccedil' => 231, 'cedil' => 184, 'cent' => 162,
      'Chi' => 935, 'chi' => 967, 'circ' => 710, 'clubs' => 9827, 'cong' => 8773,
      'copy' => 169, 'crarr' => 8629, 'cup' => 8746, 'curren' => 164,
      # D
      'dagger' => 8224, 'Dagger' => 8225, 'darr' => 8595, 'dArr' => 8659, 'deg' => 176,
      'Delta' => 916, 'delta' => 948, 'diams' => 9830, 'divide' => 247,
      # E
      'Eacute' => 201, 'eacute' => 233, 'Ecirc' => 202, 'ecirc' => 234, 'Egrave' => 200,
      'egrave' => 232, 'empty' => 8709, 'emsp' => 8195, 'ensp' => 8194, 'Epsilon' => 917,
      'epsilon' => 949, 'equiv' => 8801, 'Eta' => 919, 'eta' => 951, 'ETH' => 208,
      'eth' => 240, 'Euml' => 203, 'euml' => 235, 'euro' => 8364, 'exist' => 8707,
      # F
      'fnof' => 402, 'forall' => 8704, 'frac12' => 189, 'frac14' => 188, 'frac34' => 190,
      'frasl' => 8260,
      # G
      'Gamma' => 915, 'gamma' => 947, 'ge' => 8805, 'gt' => 62,
      # H
      'harr' => 8596, 'hArr' => 8660, 'hearts' => 9829, 'hellip' => 8230,
      # I
      'Iacute' => 205, 'iacute' => 237, 'Icirc' => 206, 'icirc' => 238, 'iexcl' => 161,
      'Igrave' => 204, 'igrave' => 236, 'image' => 8465, 'infin' => 8734, 'int' => 8747,
      'Iota' => 921, 'iota' => 953, 'iquest' => 191, 'isin' => 8712, 'Iuml' => 207,
      'iuml' => 239,
      # K
      'Kappa' => 922, 'kappa' => 954,
      # L
      'Lambda' => 923, 'lambda' => 955, 'lang' => 9001, 'laquo' => 171, 'larr' => 8592,
      'lArr' => 8656, 'lceil' => 8968, 'ldquo' => 8220, 'le' => 8804, 'lfloor' => 8970,
      'lowast' => 8727, 'loz' => 9674, 'lrm' => 8206, 'lsaquo' => 8249, 'lsquo' => 8216,
      'lt' => 60,
      # M
      'macr' => 175, 'mdash' => 8212, 'micro' => 181, 'middot' => 183, 'minus' => 8722,
      'Mu' => 924, 'mu' => 956,
      # N
      'nabla' => 8711, 'nbsp' => 160, 'ndash' => 8211, 'ne' => 8800, 'ni' => 8715,
      'not' => 172, 'notin' => 8713, 'nsub' => 8836, 'Ntilde' => 209, 'ntilde' => 241,
      'Nu' => 925, 'nu' => 957,
      # O
      'Oacute' => 211, 'oacute' => 243, 'Ocirc' => 212, 'ocirc' => 244, 'OElig' => 338,
      'oelig' => 339, 'Ograve' => 210, 'ograve' => 242, 'oline' => 8254, 'Omega' => 937,
      'omega' => 969, 'Omicron' => 927, 'omicron' => 959, 'oplus' => 8853, 'or' => 8744,
      'ordf' => 170, 'ordm' => 186, 'Oslash' => 216, 'oslash' => 248, 'Otilde' => 213,
      'otilde' => 245, 'otimes' => 8855, 'Ouml' => 214, 'ouml' => 246,
      # P
      'para' => 182, 'part' => 8706, 'permil' => 8240, 'perp' => 8869, 'Phi' => 934,
      'phi' => 966, 'Pi' => 928, 'pi' => 960, 'piv' => 982, 'plusmn' => 177,
      'pound' => 163, 'prime' => 8242, 'Prime' => 8243, 'prod' => 8719, 'prop' => 8733,
      'Psi' => 936, 'psi' => 968,
      # Q
      'quot' => 34,
      # R
      'radic' => 8730, 'rang' => 9002, 'raquo' => 187, 'rarr' => 8594, 'rArr' => 8658,
      'rceil' => 8969, 'rdquo' => 8221, 'real' => 8476, 'reg' => 174, 'rfloor' => 8971,
      'Rho' => 929, 'rho' => 961, 'rlm' => 8207, 'rsaquo' => 8250, 'rsquo' => 8217,
      # S
      'sbquo' => 8218, 'Scaron' => 352, 'scaron' => 353, 'sdot' => 8901, 'sect' => 167,
      'shy' => 173, 'Sigma' => 931, 'sigma' => 963, 'sigmaf' => 962, 'sim' => 8764,
      'spades' => 9824, 'sub' => 8834, 'sube' => 8838, 'sum' => 8721, 'sup' => 8835,
      'sup1' => 185, 'sup2' => 178, 'sup3' => 179, 'supe' => 8839, 'szlig' => 223,
      # T
      'Tau' => 932, 'tau' => 964, 'there4' => 8756, 'Theta' => 920, 'theta' => 952,
      'thetasym' => 977, 'thinsp' => 8201, 'THORN' => 222, 'thorn' => 254, 'tilde' => 732,
      'times' => 215, 'trade' => 8482,
      # U
      'Uacute' => 218, 'uacute' => 250, 'uarr' => 8593, 'uArr' => 8657, 'Ucirc' => 219,
      'ucirc' => 251, 'Ugrave' => 217, 'ugrave' => 249, 'uml' => 168, 'upsih' => 978,
      'Upsilon' => 933, 'upsilon' => 965, 'Uuml' => 220, 'uuml' => 252,
      # W
      'weierp' => 8472,
      # X
      'Xi' => 926, 'xi' => 958,
      # Y
      'Yacute' => 221, 'yacute' => 253, 'yen' => 165, 'Yuml' => 376, 'yuml' => 255,
      # Z
      'Zeta' => 918, 'zeta' => 950, 'zwj' => 8205, 'zwnj' => 8204 );
 318
 319 /** @package MediaWiki */
 320 class Sanitizer {
 /**
  * Cleans up HTML, removes dangerous tags and attributes, and
  * removes HTML comments.
  *
  * After comments are stripped the text is split on '<'. A fragment that
  * parses as a tag whose lower-cased name is in the allowed-element table
  * is re-emitted with its attributes passed through
  * Sanitizer::fixTagAttributes(); any other fragment is emitted as text
  * with its '<' and '>' entity-escaped. The element tables are built on the
  * first call and kept in static variables; they are empty unless
  * $wgUserHtml is true.
  *
  * When $wgUseTidy is false a stack of open tags is maintained so that
  * mismatched or illegally nested tags are escaped and tags still open at
  * the end are closed. When $wgUseTidy is true no nesting checks are done.
  *
  * @private
  * @param string $text
  * @param callback $processCallback to do any variable or parameter
  *   replacements in HTML attribute values; invoked with the attribute
  *   text (by reference) and $args
  * @param array $args for the processing callback
  * @return string
  */
 static function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
     global $wgUseTidy, $wgUserHtml;

     static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
         $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;

     wfProfileIn( __METHOD__ );

     if ( !$staticInitialised ) {
         if ( $wgUserHtml ) {
             $htmlpairs = array( # Tags that must be closed
                 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
                 'strike', 'strong', 'tt', 'var', 'div', 'center',
                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
             );
             $htmlsingle = array(
                 'br', 'hr', 'li', 'dt', 'dd'
             );
             $htmlsingleonly = array( # Elements that cannot have close tags
                 'br', 'hr'
             );
             $htmlnest = array( # Tags that can be nested--??
                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
             );
             $tabletags = array( # Can only appear inside table
                 'td', 'th', 'tr',
             );
             $htmllist = array( # Tags used by list
                 'ul','ol',
             );
             $listtags = array( # Tags that can appear in a list
                 'li',
             );

         } else {
             # Every table has to be an array here: all of them are
             # passed to array_flip() below.
             $htmlpairs = array();
             $htmlsingle = array();
             $htmlsingleonly = array();
             $htmlnest = array();
             $tabletags = array();
             $htmllist = array();
             $listtags = array();
         }

         $htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
         $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );

         # Convert them all to hashtables for faster lookup
         $vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
             'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' );
         foreach ( $vars as $var ) {
             $$var = array_flip( $$var );
         }
         $staticInitialised = true;
     }

     # Groups: 1 = closing slash, 2 = tag name, 3 = attribute text,
     # 4 = '>' or '/>', 5 = text following the tag
     $tagPattern = '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!';

     # Remove HTML comments
     $text = Sanitizer::removeHTMLcomments( $text );
     $bits = explode( '<', $text );
     $text = array_shift( $bits );
     if ( !$wgUseTidy ) {
         $tagstack = $tablestack = array();
         foreach ( $bits as $x ) {
             if ( !preg_match( $tagPattern, $x, $regs ) ) {
                 # Not shaped like a tag at all: keep it as literal text
                 $text .= '&lt;' . str_replace( '>', '&gt;', $x );
                 continue;
             }
             list( , $slash, $t, $params, $brace, $rest ) = $regs;
             $t = strtolower( $t );

             $badtag = 0 ;
             if ( isset( $htmlelements[$t] ) ) {
                 # Check our stack
                 if ( $slash ) {
                     # Closing a tag...
                     if ( isset( $htmlsingleonly[$t] ) ) {
                         $badtag = 1;
                     } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
                         if ( isset( $htmlsingleallowed[$ot] ) ) {
                             # Pop all elements with an optional close tag
                             # and see if we find a match below them
                             # NOTE(review): when the match found this way is
                             # 'table', $tablestack is not popped as it is in
                             # the direct-match branch below; and on a
                             # mismatch the last popped element is not pushed
                             # back. Behaviour left unchanged -- confirm
                             # whether either is intended.
                             $optstack = array();
                             array_push ($optstack, $ot);
                             while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
                                     isset( $htmlsingleallowed[$ot] ) )
                             {
                                 array_push ($optstack, $ot);
                             }
                             if ( $t != $ot ) {
                                 # No match. Push the optional elements back again
                                 $badtag = 1;
                                 while ( $ot = @array_pop( $optstack ) ) {
                                     array_push( $tagstack, $ot );
                                 }
                             }
                         } else {
                             @array_push( $tagstack, $ot );
                             # <li> can be nested in <ul> or <ol>, skip those cases:
                             if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) {
                                 $badtag = 1;
                             }
                         }
                     } else {
                         if ( $t == 'table' ) {
                             # Restore the stack saved when this table was opened
                             $tagstack = array_pop( $tablestack );
                         }
                     }
                     $newparams = '';
                 } else {
                     # Keep track for later
                     if ( isset( $tabletags[$t] ) &&
                     ! in_array( 'table', $tagstack ) ) {
                         $badtag = 1;
                     } else if ( in_array( $t, $tagstack ) &&
                     ! isset( $htmlnest [$t ] ) ) {
                         $badtag = 1 ;
                     # Is it a self closed htmlpair ? (bug 5487)
                     } else if( $brace == '/>' &&
                     isset( $htmlpairs[$t] ) ) {
                         $badtag = 1;
                     } elseif( isset( $htmlsingleonly[$t] ) ) {
                         # Hack to force empty tag for uncloseable elements
                         $brace = '/>';
                     } else if( isset( $htmlsingle[$t] ) ) {
                         # Hack to not close $htmlsingle tags
                         $brace = NULL;
                     } else {
                         if ( $t == 'table' ) {
                             # A table starts with a fresh stack; the outer
                             # one is put back when the table is closed
                             array_push( $tablestack, $tagstack );
                             $tagstack = array();
                         }
                         array_push( $tagstack, $t );
                     }

                     # Replace any variables or template parameters with
                     # plaintext results.
                     if( is_callable( $processCallback ) ) {
                         call_user_func_array( $processCallback, array( &$params, $args ) );
                     }

                     # Strip non-approved attributes from the tag
                     $newparams = Sanitizer::fixTagAttributes( $params, $t );
                 }
                 if ( ! $badtag ) {
                     $rest = str_replace( '>', '&gt;', $rest );
                     $close = ( $brace == '/>' ) ? ' /' : '';
                     $text .= "<$slash$t$newparams$close>$rest";
                     continue;
                 }
             }
             # Unknown element or rejected tag: show it as literal text
             $text .= '&lt;' . str_replace( '>', '&gt;', $x);
         }
         # Close off any remaining tags
         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
             $text .= "</$t>\n";
             if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
         }
     } else {
         # this might be possible using tidy itself
         foreach ( $bits as $x ) {
             if ( preg_match( $tagPattern, $x, $regs ) ) {
                 list( , $slash, $t, $params, $brace, $rest ) = $regs;
                 $t = strtolower( $t );
                 if ( isset( $htmlelements[$t] ) ) {
                     if( is_callable( $processCallback ) ) {
                         call_user_func_array( $processCallback, array( &$params, $args ) );
                     }
                     $newparams = Sanitizer::fixTagAttributes( $params, $t );
                     $rest = str_replace( '>', '&gt;', $rest );
                     $text .= "<$slash$t$newparams$brace$rest";
                     continue;
                 }
             }
             # Not a tag, or not an allowed element: keep as literal text
             $text .= '&lt;' . str_replace( '>', '&gt;', $x);
         }
     }
     wfProfileOut( __METHOD__ );
     return $text;
 }
 506
 /**
  * Strip every complete HTML comment ('<!--' through '-->') from the text.
  *
  * A comment that sits between two newlines, with nothing but spaces on
  * either side of it, is removed together with those spaces and one of
  * the newlines so that no blank line is left behind. An unterminated
  * comment stops processing and is left in place.
  *
  * @private
  * @param string $text
  * @return string
  */
 static function removeHTMLcomments( $text ) {
     wfProfileIn( __METHOD__ );
     for ( ;; ) {
         $open = strpos( $text, '<!--' );
         if ( $open === false ) {
             break;
         }
         $close = strpos( $text, '-->', $open + 4 );
         if ( $close === false ) {
             # Unterminated comment; bail out
             break;
         }
         $after = $close + 3;

         # Grow a window around the comment: start one character before
         # it, then extend over any run of spaces on both sides.
         $left = max( $open - 1, 0 );
         $width = $after - $left;
         while ( $left > 0 && substr( $text, $left, 1 ) === ' ' ) {
             $left--;
             $width++;
         }
         while ( substr( $text, $left + $width, 1 ) === ' ' ) {
             $width++;
         }

         $newlineBefore = substr( $text, $left, 1 ) === "\n";
         $newlineAfter = substr( $text, $left + $width, 1 ) === "\n";
         if ( $newlineBefore && $newlineAfter ) {
             # Collapse newline + spaces + comment + spaces + newline
             # into a single newline.
             $text = substr_replace( $text, "\n", $left, $width + 1 );
         } else {
             # Drop the comment only.
             $text = substr_replace( $text, '', $open, $after - $open );
         }
     }
     wfProfileOut( __METHOD__ );
     return $text;
 }
 551
 /**
  * Filter a name => value attribute array down to what is permitted on
  * the given element.
  *
  * - Attributes not returned by Sanitizer::attributeWhitelist() for the
  *   element are dropped
  * - 'style' values go through Sanitizer::checkCss(); the attribute is
  *   dropped when that returns false
  * - 'id' values go through Sanitizer::escapeId()
  *
  * @param array $attribs
  * @param string $element
  * @return array
  *
  * @todo Check for legal values where the DTD limits things.
  * @todo Check for unique id attribute :P
  */
 static function validateTagAttributes( $attribs, $element ) {
     $allowed = array_flip( Sanitizer::attributeWhitelist( $element ) );

     $clean = array();
     foreach ( array_intersect_key( $attribs, $allowed ) as $name => $content ) {
         if ( $name == 'style' ) {
             # Strip javascript "expression" from stylesheets.
             # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
             $content = Sanitizer::checkCss( $content );
             if ( $content === false ) {
                 # Rejected outright; drop the attribute
                 continue;
             }
         } elseif ( $name === 'id' ) {
             $content = Sanitizer::escapeId( $content );
         }

         // Keyed by name, so the result never holds the same
         // attribute twice.
         $clean[$name] = $content;
     }
     return $clean;
 }
 592
 /**
  * Pick apart some CSS and check it for forbidden or unsafe structures.
  * Returns a sanitized string, or false if it was just too evil.
  *
  * The returned string is the input with character references decoded
  * and CSS comments replaced by a space. The check itself runs on a copy
  * in which CSS hex escapes are decoded and remaining backslashes are
  * removed. That copy is rejected when it contains, case-insensitively,
  * 'expression', 'tp' followed by any number of 's' and then '://', or
  * 'url' followed by optional whitespace and '('.
  *
  * @param string $value
  * @return mixed string on success, false when a forbidden construct is found
  */
 static function checkCss( $value ) {
     $stripped = Sanitizer::decodeCharReferences( $value );

     // Remove any comments; IE gets token splitting wrong
     $stripped = StringUtils::delimiterReplace( '/\*', '\*/', ' ', $stripped );

     $value = $stripped;

     // ... and continue checks
     // Decode backslash-hex escapes through a callback. The /e modifier
     // used for this previously no longer exists in PHP 7+, where
     // preg_replace() returns NULL for it and nothing would be checked.
     $stripped = preg_replace_callback(
         '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
         array( 'Sanitizer', 'cssDecodeCallback' ),
         $stripped );
     $stripped = str_replace( '\\', '', $stripped );
     if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
             $stripped ) ) {
         # haxx0r
         return false;
     }

     return $value;
 }

 /**
  * preg_replace_callback() handler for Sanitizer::checkCss(): turns one
  * CSS hex escape into the UTF-8 character it denotes.
  *
  * @private
  * @param array $matches $matches[1] holds the 1-6 hex digits of the escape
  * @return string
  */
 static function cssDecodeCallback( $matches ) {
     return codepointToUtf8( hexdec( $matches[1] ) );
 }
 622
 /**
  * Turn the raw attribute portion of a tag into a well-formed, safe
  * attribute string for the given element.
  *
  * The text is decoded with Sanitizer::decodeTagAttributes(), filtered
  * with Sanitizer::validateTagAttributes(), and each surviving attribute
  * is written as name="value" with the name passed through
  * htmlspecialchars() and the value through
  * Sanitizer::safeEncodeAttribute().
  *
  * - Every value is double-quoted
  * - Each attribute is preceded by a single space
  * - Blank input, or input with no surviving attributes, gives ''
  *
  * @param string $text
  * @param string $element
  * @return string
  */
 static function fixTagAttributes( $text, $element ) {
     if ( trim( $text ) === '' ) {
         return '';
     }

     $decoded = Sanitizer::decodeTagAttributes( $text );
     $permitted = Sanitizer::validateTagAttributes( $decoded, $element );

     $rendered = '';
     foreach ( $permitted as $name => $content ) {
         $safeName = htmlspecialchars( $name );
         $safeContent = Sanitizer::safeEncodeAttribute( $content );
         $rendered .= " $safeName=\"$safeContent\"";
     }
     return $rendered;
 }
 659
 /**
  * Encode an attribute value for HTML output.
  *
  * Applies htmlspecialchars() and then writes newline, carriage return
  * and tab as numeric character references.
  *
  * @param $text
  * @return HTML-encoded text fragment
  */
 static function encodeAttribute( $text ) {
     // Whitespace is normalized during attribute decoding, so these
     // characters would not survive unless they are encoded up front.
     $rawWhitespace = array( "\n", "\r", "\t" );
     $asReferences = array( '&#10;', '&#13;', '&#9;' );

     return str_replace( $rawWhitespace, $asReferences, htmlspecialchars( $text ) );
 }
 679
 680         /**
 681          * Encode an attribute value for HTML tags, with extra armoring
 682          * against further wiki processing.
 683          * @param $text
 684          * @return HTML-encoded text fragment
 685          */
 686         static function safeEncodeAttribute( $text ) {
 687                 $encValue = Sanitizer::encodeAttribute( $text );
 688
 689                 # Templates and links may be expanded in later parsing,
 690                 # creating invalid or dangerous output. Suppress this.
 691                 $encValue = strtr( $encValue, array(
 692                         '<'    => '&lt;',   // This should never happen,
 693                         '>'    => '&gt;',   // we've received invalid input
 694                         '"'    => '&quot;', // which should have been escaped.
 695                         '{'    => '&#123;',
 696                         '['    => '&#91;',
 697                         "''"   => '&#39;&#39;',
 698                         'ISBN' => '&#73;SBN',
 699                         'RFC'  => '&#82;FC',
 700                         'PMID' => '&#80;MID',
 701                         '|'    => '&#124;',
 702                         '__'   => '&#95;_',
 703                 ) );
 704
 705                 # Stupid hack
 706                 $encValue = preg_replace_callback(
 707                         '/(' . wfUrlProtocols() . ')/',
 708                         array( 'Sanitizer', 'armorLinksCallback' ),
 709                         $encValue );
 710                 return $encValue;
 711         }
 712
 713         /**
 714          * Given a value escape it so that it can be used in an id attribute and
 715          * return it, this does not validate the value however (see first link)
 716          *
 717          * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
 718          *                                                          in the id and
 719          *                                                          name attributes
 720          * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 721          *
 722          * @bug 4461
 723          *
 724          * @static
 725          *
 726          * @param string $id
 727          * @return string
 728          */
 729         static function escapeId( $id ) {
 730                 static $replace = array(
 731                         '%3A' => ':',
 732                         '%' => '.'
 733                 );
 734
 735                 $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
 736
 737                 return str_replace( array_keys( $replace ), array_values( $replace ), $id );
 738         }
 739
 740         /**
 741          * Given a value, escape it so that it can be used as a CSS class and
 742          * return it.
 743          *
 744          * TODO: For extra validity, input should be validated UTF-8.
 745          *
 746          * @link http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
 747          *
 748          * @param string $class
 749          * @return string
 750          */
 751         static function escapeClass( $class ) {
 752                 // Convert ugly stuff to underscores and kill underscores in ugly places
 753                 return rtrim(preg_replace(
 754                         array('/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/','/_+/'),
 755                         '_',
 756                         $class ), '_');
 757         }
 758
 759         /**
 760          * Regex replace callback for armoring links against further processing.
 761          * @param array $matches
 762          * @return string
 763          * @private
 764          */
 765         private static function armorLinksCallback( $matches ) {
 766                 return str_replace( ':', '&#58;', $matches[1] );
 767         }
 768
 769         /**
 770          * Return an associative array of attribute names and values from
 771          * a partial tag string. Attribute names are forces to lowercase,
 772          * character references are decoded to UTF-8 text.
 773          *
 774          * @param string
 775          * @return array
 776          */
 777         static function decodeTagAttributes( $text ) {
 778                 $attribs = array();
 779
 780                 if( trim( $text ) == '' ) {
 781                         return $attribs;
 782                 }
 783
 784                 $pairs = array();
 785                 if( !preg_match_all(
 786                         MW_ATTRIBS_REGEX,
 787                         $text,
 788                         $pairs,
 789                         PREG_SET_ORDER ) ) {
 790                         return $attribs;
 791                 }
 792
 793                 foreach( $pairs as $set ) {
 794                         $attribute = strtolower( $set[1] );
 795                         $value = Sanitizer::getTagAttributeCallback( $set );
 796
 797                         // Normalize whitespace
 798                         $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
 799                         $value = trim( $value );
 800
 801                         // Decode character references
 802                         $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
 803                 }
 804                 return $attribs;
 805         }
 806
 807         /**
 808          * Pick the appropriate attribute value from a match set from the
 809          * MW_ATTRIBS_REGEX matches.
 810          *
 811          * @param array $set
 812          * @return string
 813          * @private
 814          */
 815         private static function getTagAttributeCallback( $set ) {
 816                 if( isset( $set[6] ) ) {
 817                         # Illegal #XXXXXX color with no quotes.
 818                         return $set[6];
 819                 } elseif( isset( $set[5] ) ) {
 820                         # No quotes.
 821                         return $set[5];
 822                 } elseif( isset( $set[4] ) ) {
 823                         # Single-quoted
 824                         return $set[4];
 825                 } elseif( isset( $set[3] ) ) {
 826                         # Double-quoted
 827                         return $set[3];
 828                 } elseif( !isset( $set[2] ) ) {
 829                         # In XHTML, attributes must have a value.
 830                         # For 'reduced' form, return explicitly the attribute name here.
 831                         return $set[1];
 832                 } else {
 833                         throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
 834                 }
 835         }
 836
 837         /**
 838          * Normalize whitespace and character references in an XML source-
 839          * encoded text for an attribute value.
 840          *
 841          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 842          * but note that we're not returning the value, but are returning
 843          * XML source fragments that will be slapped into output.
 844          *
 845          * @param string $text
 846          * @return string
 847          * @private
 848          */
 849         private static function normalizeAttributeValue( $text ) {
 850                 return str_replace( '"', '&quot;',
 851                         preg_replace(
 852                                 '/\r\n|[\x20\x0d\x0a\x09]/',
 853                                 ' ',
 854                                 Sanitizer::normalizeCharReferences( $text ) ) );
 855         }
 856
 857         /**
 858          * Ensure that any entities and character references are legal
 859          * for XML and XHTML specifically. Any stray bits will be
 860          * &amp;-escaped to result in a valid text fragment.
 861          *
 862          * a. any named char refs must be known in XHTML
 863          * b. any numeric char refs must be legal chars, not invalid or forbidden
 864          * c. use &#x, not &#X
 865          * d. fix or reject non-valid attributes
 866          *
 867          * @param string $text
 868          * @return string
 869          * @private
 870          */
 871         static function normalizeCharReferences( $text ) {
 872                 return preg_replace_callback(
 873                         MW_CHAR_REFS_REGEX,
 874                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
 875                         $text );
 876         }
 877         /**
 878          * @param string $matches
 879          * @return string
 880          */
 881         static function normalizeCharReferencesCallback( $matches ) {
 882                 $ret = null;
 883                 if( $matches[1] != '' ) {
 884                         $ret = Sanitizer::normalizeEntity( $matches[1] );
 885                 } elseif( $matches[2] != '' ) {
 886                         $ret = Sanitizer::decCharReference( $matches[2] );
 887                 } elseif( $matches[3] != ''  ) {
 888                         $ret = Sanitizer::hexCharReference( $matches[3] );
 889                 } elseif( $matches[4] != '' ) {
 890                         $ret = Sanitizer::hexCharReference( $matches[4] );
 891                 }
 892                 if( is_null( $ret ) ) {
 893                         return htmlspecialchars( $matches[0] );
 894                 } else {
 895                         return $ret;
 896                 }
 897         }
 898
 899         /**
 900          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 901          * return the named entity reference as is. Otherwise, returns
 902          * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
 903          *
 904          * @param string $name
 905          * @return string
 906          * @static
 907          */
 908         static function normalizeEntity( $name ) {
 909                 global $wgHtmlEntities;
 910                 if( isset( $wgHtmlEntities[$name] ) ) {
 911                         return "&$name;";
 912                 } else {
 913                         return "&amp;$name;";
 914                 }
 915         }
 916
 917         static function decCharReference( $codepoint ) {
 918                 $point = intval( $codepoint );
 919                 if( Sanitizer::validateCodepoint( $point ) ) {
 920                         return sprintf( '&#%d;', $point );
 921                 } else {
 922                         return null;
 923                 }
 924         }
 925
 926         static function hexCharReference( $codepoint ) {
 927                 $point = hexdec( $codepoint );
 928                 if( Sanitizer::validateCodepoint( $point ) ) {
 929                         return sprintf( '&#x%x;', $point );
 930                 } else {
 931                         return null;
 932                 }
 933         }
 934
 935         /**
 936          * Returns true if a given Unicode codepoint is a valid character in XML.
 937          * @param int $codepoint
 938          * @return bool
 939          */
 940         private static function validateCodepoint( $codepoint ) {
 941                 return ($codepoint ==    0x09)
 942                         || ($codepoint ==    0x0a)
 943                         || ($codepoint ==    0x0d)
 944                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
 945                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
 946                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
 947         }
 948
 949         /**
 950          * Decode any character references, numeric or named entities,
 951          * in the text and return a UTF-8 string.
 952          *
 953          * @param string $text
 954          * @return string
 955          * @public
 956          * @static
 957          */
 958         public static function decodeCharReferences( $text ) {
 959                 return preg_replace_callback(
 960                         MW_CHAR_REFS_REGEX,
 961                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
 962                         $text );
 963         }
 964
 965         /**
 966          * @param string $matches
 967          * @return string
 968          */
 969         static function decodeCharReferencesCallback( $matches ) {
 970                 if( $matches[1] != '' ) {
 971                         return Sanitizer::decodeEntity( $matches[1] );
 972                 } elseif( $matches[2] != '' ) {
 973                         return  Sanitizer::decodeChar( intval( $matches[2] ) );
 974                 } elseif( $matches[3] != ''  ) {
 975                         return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
 976                 } elseif( $matches[4] != '' ) {
 977                         return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
 978                 }
 979                 # Last case should be an ampersand by itself
 980                 return $matches[0];
 981         }
 982
 983         /**
 984          * Return UTF-8 string for a codepoint if that is a valid
 985          * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 986          * @param int $codepoint
 987          * @return string
 988          * @private
 989          */
 990         static function decodeChar( $codepoint ) {
 991                 if( Sanitizer::validateCodepoint( $codepoint ) ) {
 992                         return codepointToUtf8( $codepoint );
 993                 } else {
 994                         return UTF8_REPLACEMENT;
 995                 }
 996         }
 997
 998         /**
 999          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
1000          * return the UTF-8 encoding of that character. Otherwise, returns
1001          * pseudo-entity source (eg &foo;)
1002          *
1003          * @param string $name
1004          * @return string
1005          */
1006         static function decodeEntity( $name ) {
1007                 global $wgHtmlEntities;
1008                 if( isset( $wgHtmlEntities[$name] ) ) {
1009                         return codepointToUtf8( $wgHtmlEntities[$name] );
1010                 } else {
1011                         return "&$name;";
1012                 }
1013         }
1014
1015         /**
1016          * Fetch the whitelist of acceptable attributes for a given
1017          * element name.
1018          *
1019          * @param string $element
1020          * @return array
1021          */
1022         static function attributeWhitelist( $element ) {
1023                 static $list;
1024                 if( !isset( $list ) ) {
1025                         $list = Sanitizer::setupAttributeWhitelist();
1026                 }
1027                 return isset( $list[$element] )
1028                         ? $list[$element]
1029                         : array();
1030         }
1031
1032         /**
1033          * @todo Document it a bit
1034          * @return array
1035          */
1036         static function setupAttributeWhitelist() {
1037                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
1038                 $block = array_merge( $common, array( 'align' ) );
1039                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
1040                 $tablecell = array( 'abbr',
1041                                     'axis',
1042                                     'headers',
1043                                     'scope',
1044                                     'rowspan',
1045                                     'colspan',
1046                                     'nowrap', # deprecated
1047                                     'width',  # deprecated
1048                                     'height', # deprecated
1049                                     'bgcolor' # deprecated
1050                                     );
1051
1052                 # Numbers refer to sections in HTML 4.01 standard describing the element.
1053                 # See: http://www.w3.org/TR/html4/
1054                 $whitelist = array (
1055                         # 7.5.4
1056                         'div'        => $block,
1057                         'center'     => $common, # deprecated
1058                         'span'       => $block, # ??
1059
1060                         # 7.5.5
1061                         'h1'         => $block,
1062                         'h2'         => $block,
1063                         'h3'         => $block,
1064                         'h4'         => $block,
1065                         'h5'         => $block,
1066                         'h6'         => $block,
1067
1068                         # 7.5.6
1069                         # address
1070
1071                         # 8.2.4
1072                         # bdo
1073
1074                         # 9.2.1
1075                         'em'         => $common,
1076                         'strong'     => $common,
1077                         'cite'       => $common,
1078                         # dfn
1079                         'code'       => $common,
1080                         # samp
1081                         # kbd
1082                         'var'        => $common,
1083                         # abbr
1084                         # acronym
1085
1086                         # 9.2.2
1087                         'blockquote' => array_merge( $common, array( 'cite' ) ),
1088                         # q
1089
1090                         # 9.2.3
1091                         'sub'        => $common,
1092                         'sup'        => $common,
1093
1094                         # 9.3.1
1095                         'p'          => $block,
1096
1097                         # 9.3.2
1098                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
1099
1100                         # 9.3.4
1101                         'pre'        => array_merge( $common, array( 'width' ) ),
1102
1103                         # 9.4
1104                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
1105                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
1106
1107                         # 10.2
1108                         'ul'         => array_merge( $common, array( 'type' ) ),
1109                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
1110                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
1111
1112                         # 10.3
1113                         'dl'         => $common,
1114                         'dd'         => $common,
1115                         'dt'         => $common,
1116
1117                         # 11.2.1
1118                         'table'      => array_merge( $common,
1119                                                                 array( 'summary', 'width', 'border', 'frame',
1120                                                                                 'rules', 'cellspacing', 'cellpadding',
1121                                                                                 'align', 'bgcolor',
1122                                                                 ) ),
1123
1124                         # 11.2.2
1125                         'caption'    => array_merge( $common, array( 'align' ) ),
1126
1127                         # 11.2.3
1128                         'thead'      => array_merge( $common, $tablealign ),
1129                         'tfoot'      => array_merge( $common, $tablealign ),
1130                         'tbody'      => array_merge( $common, $tablealign ),
1131
1132                         # 11.2.4
1133                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1134                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1135
1136                         # 11.2.5
1137                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
1138
1139                         # 11.2.6
1140                         'td'         => array_merge( $common, $tablecell, $tablealign ),
1141                         'th'         => array_merge( $common, $tablecell, $tablealign ),
1142
1143                         # 15.2.1
1144                         'tt'         => $common,
1145                         'b'          => $common,
1146                         'i'          => $common,
1147                         'big'        => $common,
1148                         'small'      => $common,
1149                         'strike'     => $common,
1150                         's'          => $common,
1151                         'u'          => $common,
1152
1153                         # 15.2.2
1154                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
1155                         # basefont
1156
1157                         # 15.3
1158                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
1159
1160                         # XHTML Ruby annotation text module, simple ruby only.
1161                         # http://www.w3c.org/TR/ruby/
1162                         'ruby'       => $common,
1163                         # rbc
1164                         # rtc
1165                         'rb'         => $common,
1166                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
1167                         'rp'         => $common,
1168                         );
1169                 return $whitelist;
1170         }
1171
1172         /**
1173          * Take a fragment of (potentially invalid) HTML and return
1174          * a version with any tags removed, encoded suitably for literal
1175          * inclusion in an attribute value.
1176          *
1177          * @param string $text HTML fragment
1178          * @return string
1179          */
1180         static function stripAllTags( $text ) {
1181                 # Actual <tags>
1182                 $text = StringUtils::delimiterReplace( '<', '>', '', $text );
1183
1184                 # Normalize &entities and whitespace
1185                 $text = Sanitizer::normalizeAttributeValue( $text );
1186
1187                 # Will be placed into "double-quoted" attributes,
1188                 # make sure remaining bits are safe.
1189                 $text = str_replace(
1190                         array('<', '>', '"'),
1191                         array('&lt;', '&gt;', '&quot;'),
1192                         $text );
1193
1194                 return $text;
1195         }
1196
1197         /**
1198          * Hack up a private DOCTYPE with HTML's standard entity declarations.
1199          * PHP 4 seemed to know these if you gave it an HTML doctype, but
1200          * PHP 5.1 doesn't.
1201          *
1202          * Use for passing XHTML fragments to PHP's XML parsing functions
1203          *
1204          * @return string
1205          * @static
1206          */
1207         static function hackDocType() {
1208                 global $wgHtmlEntities;
1209                 $out = "<!DOCTYPE html [\n";
1210                 foreach( $wgHtmlEntities as $entity => $codepoint ) {
1211                         $out .= "<!ENTITY $entity \"&#$codepoint;\">";
1212                 }
1213                 $out .= "]>\n";
1214                 return $out;
1215         }
1216
1217         static function cleanUrl( $url, $hostname=true ) {
1218                 # Normalize any HTML entities in input. They will be
1219                 # re-escaped by makeExternalLink().
1220                 $url = Sanitizer::decodeCharReferences( $url );
1221
1222                 # Escape any control characters introduced by the above step
1223                 $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
1224
1225                 # Validate hostname portion
1226                 if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
1227                         list( $whole, $protocol, $host, $rest ) = $matches;
1228
1229                         // Characters that will be ignored in IDNs.
1230                         // http://tools.ietf.org/html/3454#section-3.1
1231                         // Strip them before further processing so blacklists and such work.
1232                         $strip = "/
1233                                 \\s|          # general whitespace
1234                                 \xc2\xad|     # 00ad SOFT HYPHEN
1235                                 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
1236                                 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
1237                                 \xe2\x81\xa0| # 2060 WORD JOINER
1238                                 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
1239                                 \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
1240                                 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
1241                                 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
1242                                 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
1243                                 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
1244                                 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
1245                                 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
1246                                 /xuD";
1247
1248                         $host = preg_replace( $strip, '', $host );
1249
1250                         // @fixme: validate hostnames here
1251
1252                         return $protocol . $host . $rest;
1253                 } else {
1254                         return $url;
1255                 }
1256         }
1257
1258 }
1259
1260 ?>