includes/Sanitizer.php

   1 <?php
   2 /**
   3  * XHTML sanitizer for MediaWiki
   4  *
   5  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * http://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @addtogroup Parser
  24  */
  25
  26 /**
  27  * Regular expression to match various types of character references in
  28  * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
  29  */
  30 define( 'MW_CHAR_REFS_REGEX',
  31         '/&([A-Za-z0-9]+);
  32          |&\#([0-9]+);
  33          |&\#x([0-9A-Za-z]+);
  34          |&\#X([0-9A-Za-z]+);
  35          |(&)/x' );
  36
  37 /**
  38  * Regular expression to match HTML/XML attribute pairs within a tag.
  39  * Allows some... latitude.
  40  * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
  41  */
  42 $attrib = '[A-Za-z0-9]';
  43 $space = '[\x09\x0a\x0d\x20]';
  44 define( 'MW_ATTRIBS_REGEX',
  45         "/(?:^|$space)($attrib+)
  46           ($space*=$space*
  47                 (?:
  48                  # The attribute value: quoted or alone
  49                   \"([^<\"]*)\"
  50                  | '([^<']*)'
  51                  |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
  52                  |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
  53                                                          # colors are specified like this.
  54                                                          # We'll be normalizing it.
  55                 )
  56            )?(?=$space|\$)/sx" );
  57
  58 /**
  59  * List of all named character entities defined in HTML 4.01
  60  * http://www.w3.org/TR/html4/sgml/entities.html
  61  * @private
  62  */
  63 global $wgHtmlEntities;
  64 $wgHtmlEntities = array(
  65         'Aacute'   => 193,
  66         'aacute'   => 225,
  67         'Acirc'    => 194,
  68         'acirc'    => 226,
  69         'acute'    => 180,
  70         'AElig'    => 198,
  71         'aelig'    => 230,
  72         'Agrave'   => 192,
  73         'agrave'   => 224,
  74         'alefsym'  => 8501,
  75         'Alpha'    => 913,
  76         'alpha'    => 945,
  77         'amp'      => 38,
  78         'and'      => 8743,
  79         'ang'      => 8736,
  80         'Aring'    => 197,
  81         'aring'    => 229,
  82         'asymp'    => 8776,
  83         'Atilde'   => 195,
  84         'atilde'   => 227,
  85         'Auml'     => 196,
  86         'auml'     => 228,
  87         'bdquo'    => 8222,
  88         'Beta'     => 914,
  89         'beta'     => 946,
  90         'brvbar'   => 166,
  91         'bull'     => 8226,
  92         'cap'      => 8745,
  93         'Ccedil'   => 199,
  94         'ccedil'   => 231,
  95         'cedil'    => 184,
  96         'cent'     => 162,
  97         'Chi'      => 935,
  98         'chi'      => 967,
  99         'circ'     => 710,
 100         'clubs'    => 9827,
 101         'cong'     => 8773,
 102         'copy'     => 169,
 103         'crarr'    => 8629,
 104         'cup'      => 8746,
 105         'curren'   => 164,
 106         'dagger'   => 8224,
 107         'Dagger'   => 8225,
 108         'darr'     => 8595,
 109         'dArr'     => 8659,
 110         'deg'      => 176,
 111         'Delta'    => 916,
 112         'delta'    => 948,
 113         'diams'    => 9830,
 114         'divide'   => 247,
 115         'Eacute'   => 201,
 116         'eacute'   => 233,
 117         'Ecirc'    => 202,
 118         'ecirc'    => 234,
 119         'Egrave'   => 200,
 120         'egrave'   => 232,
 121         'empty'    => 8709,
 122         'emsp'     => 8195,
 123         'ensp'     => 8194,
 124         'Epsilon'  => 917,
 125         'epsilon'  => 949,
 126         'equiv'    => 8801,
 127         'Eta'      => 919,
 128         'eta'      => 951,
 129         'ETH'      => 208,
 130         'eth'      => 240,
 131         'Euml'     => 203,
 132         'euml'     => 235,
 133         'euro'     => 8364,
 134         'exist'    => 8707,
 135         'fnof'     => 402,
 136         'forall'   => 8704,
 137         'frac12'   => 189,
 138         'frac14'   => 188,
 139         'frac34'   => 190,
 140         'frasl'    => 8260,
 141         'Gamma'    => 915,
 142         'gamma'    => 947,
 143         'ge'       => 8805,
 144         'gt'       => 62,
 145         'harr'     => 8596,
 146         'hArr'     => 8660,
 147         'hearts'   => 9829,
 148         'hellip'   => 8230,
 149         'Iacute'   => 205,
 150         'iacute'   => 237,
 151         'Icirc'    => 206,
 152         'icirc'    => 238,
 153         'iexcl'    => 161,
 154         'Igrave'   => 204,
 155         'igrave'   => 236,
 156         'image'    => 8465,
 157         'infin'    => 8734,
 158         'int'      => 8747,
 159         'Iota'     => 921,
 160         'iota'     => 953,
 161         'iquest'   => 191,
 162         'isin'     => 8712,
 163         'Iuml'     => 207,
 164         'iuml'     => 239,
 165         'Kappa'    => 922,
 166         'kappa'    => 954,
 167         'Lambda'   => 923,
 168         'lambda'   => 955,
 169         'lang'     => 9001,
 170         'laquo'    => 171,
 171         'larr'     => 8592,
 172         'lArr'     => 8656,
 173         'lceil'    => 8968,
 174         'ldquo'    => 8220,
 175         'le'       => 8804,
 176         'lfloor'   => 8970,
 177         'lowast'   => 8727,
 178         'loz'      => 9674,
 179         'lrm'      => 8206,
 180         'lsaquo'   => 8249,
 181         'lsquo'    => 8216,
 182         'lt'       => 60,
 183         'macr'     => 175,
 184         'mdash'    => 8212,
 185         'micro'    => 181,
 186         'middot'   => 183,
 187         'minus'    => 8722,
 188         'Mu'       => 924,
 189         'mu'       => 956,
 190         'nabla'    => 8711,
 191         'nbsp'     => 160,
 192         'ndash'    => 8211,
 193         'ne'       => 8800,
 194         'ni'       => 8715,
 195         'not'      => 172,
 196         'notin'    => 8713,
 197         'nsub'     => 8836,
 198         'Ntilde'   => 209,
 199         'ntilde'   => 241,
 200         'Nu'       => 925,
 201         'nu'       => 957,
 202         'Oacute'   => 211,
 203         'oacute'   => 243,
 204         'Ocirc'    => 212,
 205         'ocirc'    => 244,
 206         'OElig'    => 338,
 207         'oelig'    => 339,
 208         'Ograve'   => 210,
 209         'ograve'   => 242,
 210         'oline'    => 8254,
 211         'Omega'    => 937,
 212         'omega'    => 969,
 213         'Omicron'  => 927,
 214         'omicron'  => 959,
 215         'oplus'    => 8853,
 216         'or'       => 8744,
 217         'ordf'     => 170,
 218         'ordm'     => 186,
 219         'Oslash'   => 216,
 220         'oslash'   => 248,
 221         'Otilde'   => 213,
 222         'otilde'   => 245,
 223         'otimes'   => 8855,
 224         'Ouml'     => 214,
 225         'ouml'     => 246,
 226         'para'     => 182,
 227         'part'     => 8706,
 228         'permil'   => 8240,
 229         'perp'     => 8869,
 230         'Phi'      => 934,
 231         'phi'      => 966,
 232         'Pi'       => 928,
 233         'pi'       => 960,
 234         'piv'      => 982,
 235         'plusmn'   => 177,
 236         'pound'    => 163,
 237         'prime'    => 8242,
 238         'Prime'    => 8243,
 239         'prod'     => 8719,
 240         'prop'     => 8733,
 241         'Psi'      => 936,
 242         'psi'      => 968,
 243         'quot'     => 34,
 244         'radic'    => 8730,
 245         'rang'     => 9002,
 246         'raquo'    => 187,
 247         'rarr'     => 8594,
 248         'rArr'     => 8658,
 249         'rceil'    => 8969,
 250         'rdquo'    => 8221,
 251         'real'     => 8476,
 252         'reg'      => 174,
 253         'rfloor'   => 8971,
 254         'Rho'      => 929,
 255         'rho'      => 961,
 256         'rlm'      => 8207,
 257         'rsaquo'   => 8250,
 258         'rsquo'    => 8217,
 259         'sbquo'    => 8218,
 260         'Scaron'   => 352,
 261         'scaron'   => 353,
 262         'sdot'     => 8901,
 263         'sect'     => 167,
 264         'shy'      => 173,
 265         'Sigma'    => 931,
 266         'sigma'    => 963,
 267         'sigmaf'   => 962,
 268         'sim'      => 8764,
 269         'spades'   => 9824,
 270         'sub'      => 8834,
 271         'sube'     => 8838,
 272         'sum'      => 8721,
 273         'sup'      => 8835,
 274         'sup1'     => 185,
 275         'sup2'     => 178,
 276         'sup3'     => 179,
 277         'supe'     => 8839,
 278         'szlig'    => 223,
 279         'Tau'      => 932,
 280         'tau'      => 964,
 281         'there4'   => 8756,
 282         'Theta'    => 920,
 283         'theta'    => 952,
 284         'thetasym' => 977,
 285         'thinsp'   => 8201,
 286         'THORN'    => 222,
 287         'thorn'    => 254,
 288         'tilde'    => 732,
 289         'times'    => 215,
 290         'trade'    => 8482,
 291         'Uacute'   => 218,
 292         'uacute'   => 250,
 293         'uarr'     => 8593,
 294         'uArr'     => 8657,
 295         'Ucirc'    => 219,
 296         'ucirc'    => 251,
 297         'Ugrave'   => 217,
 298         'ugrave'   => 249,
 299         'uml'      => 168,
 300         'upsih'    => 978,
 301         'Upsilon'  => 933,
 302         'upsilon'  => 965,
 303         'Uuml'     => 220,
 304         'uuml'     => 252,
 305         'weierp'   => 8472,
 306         'Xi'       => 926,
 307         'xi'       => 958,
 308         'Yacute'   => 221,
 309         'yacute'   => 253,
 310         'yen'      => 165,
 311         'Yuml'     => 376,
 312         'yuml'     => 255,
 313         'Zeta'     => 918,
 314         'zeta'     => 950,
 315         'zwj'      => 8205,
 316         'zwnj'     => 8204 );
 317
 318 class Sanitizer {
 319         /**
 320          * Cleans up HTML, removes dangerous tags and attributes, and
 321          * removes HTML comments
 322          * @private
 323          * @param string $text
 324          * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
 325          * @param array $args for the processing callback
 326          * @return string
 327          */
 328         static function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
 329                 global $wgUseTidy;
 330
 331                 static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
 332                         $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;
 333
 334                 wfProfileIn( __METHOD__ );
 335
 336                 if ( !$staticInitialised ) {
 337
 338                         $htmlpairs = array( # Tags that must be closed
 339                                 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 340                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 341                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
 342                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 343                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
 344                         );
 345                         $htmlsingle = array(
 346                                 'br', 'hr', 'li', 'dt', 'dd'
 347                         );
 348                         $htmlsingleonly = array( # Elements that cannot have close tags
 349                                 'br', 'hr'
 350                         );
 351                         $htmlnest = array( # Tags that can be nested--??
 352                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 353                                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
 354                         );
 355                         $tabletags = array( # Can only appear inside table, we will close them
 356                                 'td', 'th', 'tr',
 357                         );
 358                         $htmllist = array( # Tags used by list
 359                                 'ul','ol',
 360                         );
 361                         $listtags = array( # Tags that can appear in a list
 362                                 'li',
 363                         );
 364
 365                         $htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
 366                         $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );
 367
 368                         # Convert them all to hashtables for faster lookup
 369                         $vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
 370                                 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' );
 371                         foreach ( $vars as $var ) {
 372                                 $$var = array_flip( $$var );
 373                         }
 374                         $staticInitialised = true;
 375                 }
 376
 377                 # Remove HTML comments
 378                 $text = Sanitizer::removeHTMLcomments( $text );
 379                 $bits = explode( '<', $text );
 380                 $text = str_replace( '>', '&gt;', array_shift( $bits ) );
 381                 if(!$wgUseTidy) {
 382                         $tagstack = $tablestack = array();
 383                         foreach ( $bits as $x ) {
 384                                 $regs = array();
 385                                 if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
 386                                         list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
 387                                 } else {
 388                                         $slash = $t = $params = $brace = $rest = null;
 389                                 }
 390
 391                                 $badtag = 0 ;
 392                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
 393                                         # Check our stack
 394                                         if ( $slash ) {
 395                                                 # Closing a tag...
 396                                                 if( isset( $htmlsingleonly[$t] ) ) {
 397                                                         $badtag = 1;
 398                                                 } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
 399                                                         if ( isset( $htmlsingleallowed[$ot] ) ) {
 400                                                                 # Pop all elements with an optional close tag
 401                                                                 # and see if we find a match below them
 402                                                                 $optstack = array();
 403                                                                 array_push ($optstack, $ot);
 404                                                                 while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
 405                                                                                 isset( $htmlsingleallowed[$ot] ) )
 406                                                                 {
 407                                                                         array_push ($optstack, $ot);
 408                                                                 }
 409                                                                 if ( $t != $ot ) {
 410                                                                         # No match. Push the optinal elements back again
 411                                                                         $badtag = 1;
 412                                                                         while ( $ot = @array_pop( $optstack ) ) {
 413                                                                                 array_push( $tagstack, $ot );
 414                                                                         }
 415                                                                 }
 416                                                         } else {
 417                                                                 @array_push( $tagstack, $ot );
 418                                                                 # <li> can be nested in <ul> or <ol>, skip those cases:
 419                                                                 if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) {
 420                                                                         $badtag = 1;
 421                                                                 }
 422                                                         }
 423                                                 } else {
 424                                                         if ( $t == 'table' ) {
 425                                                                 $tagstack = array_pop( $tablestack );
 426                                                         }
 427                                                 }
 428                                                 $newparams = '';
 429                                         } else {
 430                                                 # Keep track for later
 431                                                 if ( isset( $tabletags[$t] ) &&
 432                                                 ! in_array( 'table', $tagstack ) ) {
 433                                                         $badtag = 1;
 434                                                 } else if ( in_array( $t, $tagstack ) &&
 435                                                 ! isset( $htmlnest [$t ] ) ) {
 436                                                         $badtag = 1 ;
 437                                                 # Is it a self closed htmlpair ? (bug 5487)
 438                                                 } else if( $brace == '/>' &&
 439                                                 isset( $htmlpairs[$t] ) ) {
 440                                                         $badtag = 1;
 441                                                 } elseif( isset( $htmlsingleonly[$t] ) ) {
 442                                                         # Hack to force empty tag for uncloseable elements
 443                                                         $brace = '/>';
 444                                                 } else if( isset( $htmlsingle[$t] ) ) {
 445                                                         # Hack to not close $htmlsingle tags
 446                                                         $brace = NULL;
 447                                                 } else if( isset( $tabletags[$t] )
 448                                                 &&  in_array($t ,$tagstack) ) {
 449                                                         // New table tag but forgot to close the previous one
 450                                                         $text .= "</$t>";
 451                                                 } else {
 452                                                         if ( $t == 'table' ) {
 453                                                                 array_push( $tablestack, $tagstack );
 454                                                                 $tagstack = array();
 455                                                         }
 456                                                         array_push( $tagstack, $t );
 457                                                 }
 458
 459                                                 # Replace any variables or template parameters with
 460                                                 # plaintext results.
 461                                                 if( is_callable( $processCallback ) ) {
 462                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
 463                                                 }
 464
 465                                                 # Strip non-approved attributes from the tag
 466                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 467                                         }
 468                                         if ( ! $badtag ) {
 469                                                 $rest = str_replace( '>', '&gt;', $rest );
 470                                                 $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
 471                                                 $text .= "<$slash$t$newparams$close>$rest";
 472                                                 continue;
 473                                         }
 474                                 }
 475                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 476                         }
 477                         # Close off any remaining tags
 478                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 479                                 $text .= "</$t>\n";
 480                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 481                         }
 482                 } else {
 483                         # this might be possible using tidy itself
 484                         foreach ( $bits as $x ) {
 485                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 486                                 $x, $regs );
 487                                 @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
 488                                 if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
 489                                         if( is_callable( $processCallback ) ) {
 490                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
 491                                         }
 492                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 493                                         $rest = str_replace( '>', '&gt;', $rest );
 494                                         $text .= "<$slash$t$newparams$brace$rest";
 495                                 } else {
 496                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 497                                 }
 498                         }
 499                 }
 500                 wfProfileOut( __METHOD__ );
 501                 return $text;
 502         }
 503
 504         /**
 505          * Remove '<!--', '-->', and everything between.
 506          * To avoid leaving blank lines, when a comment is both preceded
 507          * and followed by a newline (ignoring spaces), trim leading and
 508          * trailing spaces and one of the newlines.
 509          *
 510          * @private
 511          * @param string $text
 512          * @return string
 513          */
 514         static function removeHTMLcomments( $text ) {
 515                 wfProfileIn( __METHOD__ );
 516                 while (($start = strpos($text, '<!--')) !== false) {
 517                         $end = strpos($text, '-->', $start + 4);
 518                         if ($end === false) {
 519                                 # Unterminated comment; bail out
 520                                 break;
 521                         }
 522
 523                         $end += 3;
 524
 525                         # Trim space and newline if the comment is both
 526                         # preceded and followed by a newline
 527                         $spaceStart = max($start - 1, 0);
 528                         $spaceLen = $end - $spaceStart;
 529                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 530                                 $spaceStart--;
 531                                 $spaceLen++;
 532                         }
 533                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 534                                 $spaceLen++;
 535                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 536                                 # Remove the comment, leading and trailing
 537                                 # spaces, and leave only one newline.
 538                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 539                         }
 540                         else {
 541                                 # Remove just the comment.
 542                                 $text = substr_replace($text, '', $start, $end - $start);
 543                         }
 544                 }
 545                 wfProfileOut( __METHOD__ );
 546                 return $text;
 547         }
 548
 549         /**
 550          * Take an array of attribute names and values and normalize or discard
 551          * illegal values for the given element type.
 552          *
 553          * - Discards attributes not on a whitelist for the given element
 554          * - Unsafe style attributes are discarded
 555          *
 556          * @param array $attribs
 557          * @param string $element
 558          * @return array
 559          *
 560          * @todo Check for legal values where the DTD limits things.
 561          * @todo Check for unique id attribute :P
 562          */
 563         static function validateTagAttributes( $attribs, $element ) {
 564                 $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
 565                 $out = array();
 566                 foreach( $attribs as $attribute => $value ) {
 567                         if( !isset( $whitelist[$attribute] ) ) {
 568                                 continue;
 569                         }
 570                         # Strip javascript "expression" from stylesheets.
 571                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 572                         if( $attribute == 'style' ) {
 573                                 $value = Sanitizer::checkCss( $value );
 574                                 if( $value === false ) {
 575                                         # haxx0r
 576                                         continue;
 577                                 }
 578                         }
 579
 580                         if ( $attribute === 'id' )
 581                                 $value = Sanitizer::escapeId( $value );
 582
 583                         // If this attribute was previously set, override it.
 584                         // Output should only have one attribute of each name.
 585                         $out[$attribute] = $value;
 586                 }
 587                 return $out;
 588         }
 589
 590         /**
 591          * Pick apart some CSS and check it for forbidden or unsafe structures.
 592          * Returns a sanitized string, or false if it was just too evil.
 593          *
 594          * Currently URL references, 'expression', 'tps' are forbidden.
 595          *
 596          * @param string $value
 597          * @return mixed
 598          */
 599         static function checkCss( $value ) {
 600                 $stripped = Sanitizer::decodeCharReferences( $value );
 601
 602                 // Remove any comments; IE gets token splitting wrong
 603                 $stripped = StringUtils::delimiterReplace( '/*', '*/', ' ', $stripped );
 604
 605                 $value = $stripped;
 606
 607                 // ... and continue checks
 608                 $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
 609                         'codepointToUtf8(hexdec("$1"))', $stripped );
 610                 $stripped = str_replace( '\\', '', $stripped );
 611                 if( preg_match( '/(?:expression|tps*:\/\/|url\\s*\().*/is',
 612                                 $stripped ) ) {
 613                         # haxx0r
 614                         return false;
 615                 }
 616
 617                 return $value;
 618         }
 619
 620         /**
 621          * Take a tag soup fragment listing an HTML element's attributes
 622          * and normalize it to well-formed XML, discarding unwanted attributes.
 623          * Output is safe for further wikitext processing, with escaping of
 624          * values that could trigger problems.
 625          *
 626          * - Normalizes attribute names to lowercase
 627          * - Discards attributes not on a whitelist for the given element
 628          * - Turns broken or invalid entities into plaintext
 629          * - Double-quotes all attribute values
 630          * - Attributes without values are given the name as attribute
 631          * - Double attributes are discarded
 632          * - Unsafe style attributes are discarded
 633          * - Prepends space if there are attributes.
 634          *
 635          * @param string $text
 636          * @param string $element
 637          * @return string
 638          */
 639         static function fixTagAttributes( $text, $element ) {
 640                 if( trim( $text ) == '' ) {
 641                         return '';
 642                 }
 643
 644                 $stripped = Sanitizer::validateTagAttributes(
 645                         Sanitizer::decodeTagAttributes( $text ), $element );
 646
 647                 $attribs = array();
 648                 foreach( $stripped as $attribute => $value ) {
 649                         $encAttribute = htmlspecialchars( $attribute );
 650                         $encValue = Sanitizer::safeEncodeAttribute( $value );
 651
 652                         $attribs[] = "$encAttribute=\"$encValue\"";
 653                 }
 654                 return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
 655         }
 656
 657         /**
 658          * Encode an attribute value for HTML output.
 659          * @param $text
 660          * @return HTML-encoded text fragment
 661          */
 662         static function encodeAttribute( $text ) {
 663                 $encValue = htmlspecialchars( $text );
 664
 665                 // Whitespace is normalized during attribute decoding,
 666                 // so if we've been passed non-spaces we must encode them
 667                 // ahead of time or they won't be preserved.
 668                 $encValue = strtr( $encValue, array(
 669                         "\n" => '&#10;',
 670                         "\r" => '&#13;',
 671                         "\t" => '&#9;',
 672                 ) );
 673
 674                 return $encValue;
 675         }
 676
 677         /**
 678          * Encode an attribute value for HTML tags, with extra armoring
 679          * against further wiki processing.
 680          * @param $text
 681          * @return HTML-encoded text fragment
 682          */
 683         static function safeEncodeAttribute( $text ) {
 684                 $encValue = Sanitizer::encodeAttribute( $text );
 685
 686                 # Templates and links may be expanded in later parsing,
 687                 # creating invalid or dangerous output. Suppress this.
 688                 $encValue = strtr( $encValue, array(
 689                         '<'    => '&lt;',   // This should never happen,
 690                         '>'    => '&gt;',   // we've received invalid input
 691                         '"'    => '&quot;', // which should have been escaped.
 692                         '{'    => '&#123;',
 693                         '['    => '&#91;',
 694                         "''"   => '&#39;&#39;',
 695                         'ISBN' => '&#73;SBN',
 696                         'RFC'  => '&#82;FC',
 697                         'PMID' => '&#80;MID',
 698                         '|'    => '&#124;',
 699                         '__'   => '&#95;_',
 700                 ) );
 701
 702                 # Stupid hack
 703                 $encValue = preg_replace_callback(
 704                         '/(' . wfUrlProtocols() . ')/',
 705                         array( 'Sanitizer', 'armorLinksCallback' ),
 706                         $encValue );
 707                 return $encValue;
 708         }
 709
 710         /**
 711          * Given a value escape it so that it can be used in an id attribute and
 712          * return it, this does not validate the value however (see first link)
 713          *
 714          * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
 715          *                                                          in the id and
 716          *                                                          name attributes
 717          * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 718          *
 719          * @static
 720          *
 721          * @param string $id
 722          * @return string
 723          */
 724         static function escapeId( $id ) {
 725                 static $replace = array(
 726                         '%3A' => ':',
 727                         '%' => '.'
 728                 );
 729
 730                 $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
 731
 732                 return str_replace( array_keys( $replace ), array_values( $replace ), $id );
 733         }
 734
 735         /**
 736          * Given a value, escape it so that it can be used as a CSS class and
 737          * return it.
 738          *
 739          * @todo For extra validity, input should be validated UTF-8.
 740          *
 741          * @see http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
 742          *
 743          * @param string $class
 744          * @return string
 745          */
 746         static function escapeClass( $class ) {
 747                 // Convert ugly stuff to underscores and kill underscores in ugly places
 748                 return rtrim(preg_replace(
 749                         array('/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/','/_+/'),
 750                         '_',
 751                         $class ), '_');
 752         }
 753
 754         /**
 755          * Regex replace callback for armoring links against further processing.
 756          * @param array $matches
 757          * @return string
 758          * @private
 759          */
 760         private static function armorLinksCallback( $matches ) {
 761                 return str_replace( ':', '&#58;', $matches[1] );
 762         }
 763
 764         /**
 765          * Return an associative array of attribute names and values from
 766          * a partial tag string. Attribute names are forced to lowercase,
 767          * character references are decoded to UTF-8 text.
 768          *
 769          * @param string $text
 770          * @return array
 771          */
 772         static function decodeTagAttributes( $text ) {
 773                 $attribs = array();
 774
 775                 if( trim( $text ) == '' ) {
 776                         return $attribs;
 777                 }
 778
 779                 $pairs = array();
 780                 if( !preg_match_all(
 781                         MW_ATTRIBS_REGEX,
 782                         $text,
 783                         $pairs,
 784                         PREG_SET_ORDER ) ) {
 785                         return $attribs;
 786                 }
 787
 788                 foreach( $pairs as $set ) {
 789                         $attribute = strtolower( $set[1] );
 790                         $value = Sanitizer::getTagAttributeCallback( $set );
 791
 792                         // Normalize whitespace
 793                         $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
 794                         $value = trim( $value );
 795
 796                         // Decode character references
 797                         $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
 798                 }
 799                 return $attribs;
 800         }
 801
 802         /**
 803          * Pick the appropriate attribute value from a match set from the
 804          * MW_ATTRIBS_REGEX matches.
 805          *
 806          * @param array $set
 807          * @return string
 808          * @private
 809          */
 810         private static function getTagAttributeCallback( $set ) {
 811                 if( isset( $set[6] ) ) {
 812                         # Illegal #XXXXXX color with no quotes.
 813                         return $set[6];
 814                 } elseif( isset( $set[5] ) ) {
 815                         # No quotes.
 816                         return $set[5];
 817                 } elseif( isset( $set[4] ) ) {
 818                         # Single-quoted
 819                         return $set[4];
 820                 } elseif( isset( $set[3] ) ) {
 821                         # Double-quoted
 822                         return $set[3];
 823                 } elseif( !isset( $set[2] ) ) {
 824                         # In XHTML, attributes must have a value.
 825                         # For 'reduced' form, return explicitly the attribute name here.
 826                         return $set[1];
 827                 } else {
 828                         throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
 829                 }
 830         }
 831
 832         /**
 833          * Normalize whitespace and character references in an XML source-
 834          * encoded text for an attribute value.
 835          *
 836          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 837          * but note that we're not returning the value, but are returning
 838          * XML source fragments that will be slapped into output.
 839          *
 840          * @param string $text
 841          * @return string
 842          * @private
 843          */
 844         private static function normalizeAttributeValue( $text ) {
 845                 return str_replace( '"', '&quot;',
 846                         preg_replace(
 847                                 '/\r\n|[\x20\x0d\x0a\x09]/',
 848                                 ' ',
 849                                 Sanitizer::normalizeCharReferences( $text ) ) );
 850         }
 851
 852         /**
 853          * Ensure that any entities and character references are legal
 854          * for XML and XHTML specifically. Any stray bits will be
 855          * &amp;-escaped to result in a valid text fragment.
 856          *
 857          * a. any named char refs must be known in XHTML
 858          * b. any numeric char refs must be legal chars, not invalid or forbidden
 859          * c. use &#x, not &#X
 860          * d. fix or reject non-valid attributes
 861          *
 862          * @param string $text
 863          * @return string
 864          * @private
 865          */
 866         static function normalizeCharReferences( $text ) {
 867                 return preg_replace_callback(
 868                         MW_CHAR_REFS_REGEX,
 869                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
 870                         $text );
 871         }
 872         /**
 873          * @param array $matches
 874          * @return string
 875          */
 876         static function normalizeCharReferencesCallback( $matches ) {
 877                 $ret = null;
 878                 if( $matches[1] != '' ) {
 879                         $ret = Sanitizer::normalizeEntity( $matches[1] );
 880                 } elseif( $matches[2] != '' ) {
 881                         $ret = Sanitizer::decCharReference( $matches[2] );
 882                 } elseif( $matches[3] != ''  ) {
 883                         $ret = Sanitizer::hexCharReference( $matches[3] );
 884                 } elseif( $matches[4] != '' ) {
 885                         $ret = Sanitizer::hexCharReference( $matches[4] );
 886                 }
 887                 if( is_null( $ret ) ) {
 888                         return htmlspecialchars( $matches[0] );
 889                 } else {
 890                         return $ret;
 891                 }
 892         }
 893
 894         /**
 895          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 896          * return the named entity reference as is. Otherwise, returns
 897          * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
 898          *
 899          * @param string $name
 900          * @return string
 901          * @static
 902          */
 903         static function normalizeEntity( $name ) {
 904                 global $wgHtmlEntities;
 905                 if( isset( $wgHtmlEntities[$name] ) ) {
 906                         return "&$name;";
 907                 } else {
 908                         return "&amp;$name;";
 909                 }
 910         }
 911
 912         static function decCharReference( $codepoint ) {
 913                 $point = intval( $codepoint );
 914                 if( Sanitizer::validateCodepoint( $point ) ) {
 915                         return sprintf( '&#%d;', $point );
 916                 } else {
 917                         return null;
 918                 }
 919         }
 920
 921         static function hexCharReference( $codepoint ) {
 922                 $point = hexdec( $codepoint );
 923                 if( Sanitizer::validateCodepoint( $point ) ) {
 924                         return sprintf( '&#x%x;', $point );
 925                 } else {
 926                         return null;
 927                 }
 928         }
 929
 930         /**
 931          * Returns true if a given Unicode codepoint is a valid character in XML.
 932          * @param int $codepoint
 933          * @return bool
 934          */
 935         private static function validateCodepoint( $codepoint ) {
 936                 return ($codepoint ==    0x09)
 937                         || ($codepoint ==    0x0a)
 938                         || ($codepoint ==    0x0d)
 939                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
 940                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
 941                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
 942         }
 943
 944         /**
 945          * Decode any character references, numeric or named entities,
 946          * in the text and return a UTF-8 string.
 947          *
 948          * @param string $text
 949          * @return string
 950          * @public
 951          * @static
 952          */
 953         public static function decodeCharReferences( $text ) {
 954                 return preg_replace_callback(
 955                         MW_CHAR_REFS_REGEX,
 956                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
 957                         $text );
 958         }
 959
 960         /**
 961          * @param array $matches
 962          * @return string
 963          */
 964         static function decodeCharReferencesCallback( $matches ) {
 965                 if( $matches[1] != '' ) {
 966                         return Sanitizer::decodeEntity( $matches[1] );
 967                 } elseif( $matches[2] != '' ) {
 968                         return  Sanitizer::decodeChar( intval( $matches[2] ) );
 969                 } elseif( $matches[3] != ''  ) {
 970                         return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
 971                 } elseif( $matches[4] != '' ) {
 972                         return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
 973                 }
 974                 # Last case should be an ampersand by itself
 975                 return $matches[0];
 976         }
 977
 978         /**
 979          * Return UTF-8 string for a codepoint if that is a valid
 980          * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 981          * @param int $codepoint
 982          * @return string
 983          * @private
 984          */
 985         static function decodeChar( $codepoint ) {
 986                 if( Sanitizer::validateCodepoint( $codepoint ) ) {
 987                         return codepointToUtf8( $codepoint );
 988                 } else {
 989                         return UTF8_REPLACEMENT;
 990                 }
 991         }
 992
 993         /**
 994          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 995          * return the UTF-8 encoding of that character. Otherwise, returns
 996          * pseudo-entity source (eg &foo;)
 997          *
 998          * @param string $name
 999          * @return string
1000          */
1001         static function decodeEntity( $name ) {
1002                 global $wgHtmlEntities;
1003                 if( isset( $wgHtmlEntities[$name] ) ) {
1004                         return codepointToUtf8( $wgHtmlEntities[$name] );
1005                 } else {
1006                         return "&$name;";
1007                 }
1008         }
1009
1010         /**
1011          * Fetch the whitelist of acceptable attributes for a given
1012          * element name.
1013          *
1014          * @param string $element
1015          * @return array
1016          */
1017         static function attributeWhitelist( $element ) {
1018                 static $list;
1019                 if( !isset( $list ) ) {
1020                         $list = Sanitizer::setupAttributeWhitelist();
1021                 }
1022                 return isset( $list[$element] )
1023                         ? $list[$element]
1024                         : array();
1025         }
1026
1027         /**
1028          * @todo Document it a bit
1029          * @return array
1030          */
1031         static function setupAttributeWhitelist() {
1032                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
1033                 $block = array_merge( $common, array( 'align' ) );
1034                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
1035                 $tablecell = array( 'abbr',
1036                                     'axis',
1037                                     'headers',
1038                                     'scope',
1039                                     'rowspan',
1040                                     'colspan',
1041                                     'nowrap', # deprecated
1042                                     'width',  # deprecated
1043                                     'height', # deprecated
1044                                     'bgcolor' # deprecated
1045                                     );
1046
1047                 # Numbers refer to sections in HTML 4.01 standard describing the element.
1048                 # See: http://www.w3.org/TR/html4/
1049                 $whitelist = array (
1050                         # 7.5.4
1051                         'div'        => $block,
1052                         'center'     => $common, # deprecated
1053                         'span'       => $block, # ??
1054
1055                         # 7.5.5
1056                         'h1'         => $block,
1057                         'h2'         => $block,
1058                         'h3'         => $block,
1059                         'h4'         => $block,
1060                         'h5'         => $block,
1061                         'h6'         => $block,
1062
1063                         # 7.5.6
1064                         # address
1065
1066                         # 8.2.4
1067                         # bdo
1068
1069                         # 9.2.1
1070                         'em'         => $common,
1071                         'strong'     => $common,
1072                         'cite'       => $common,
1073                         # dfn
1074                         'code'       => $common,
1075                         # samp
1076                         # kbd
1077                         'var'        => $common,
1078                         # abbr
1079                         # acronym
1080
1081                         # 9.2.2
1082                         'blockquote' => array_merge( $common, array( 'cite' ) ),
1083                         # q
1084
1085                         # 9.2.3
1086                         'sub'        => $common,
1087                         'sup'        => $common,
1088
1089                         # 9.3.1
1090                         'p'          => $block,
1091
1092                         # 9.3.2
1093                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
1094
1095                         # 9.3.4
1096                         'pre'        => array_merge( $common, array( 'width' ) ),
1097
1098                         # 9.4
1099                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
1100                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
1101
1102                         # 10.2
1103                         'ul'         => array_merge( $common, array( 'type' ) ),
1104                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
1105                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
1106
1107                         # 10.3
1108                         'dl'         => $common,
1109                         'dd'         => $common,
1110                         'dt'         => $common,
1111
1112                         # 11.2.1
1113                         'table'      => array_merge( $common,
1114                                                                 array( 'summary', 'width', 'border', 'frame',
1115                                                                                 'rules', 'cellspacing', 'cellpadding',
1116                                                                                 'align', 'bgcolor',
1117                                                                 ) ),
1118
1119                         # 11.2.2
1120                         'caption'    => array_merge( $common, array( 'align' ) ),
1121
1122                         # 11.2.3
1123                         'thead'      => array_merge( $common, $tablealign ),
1124                         'tfoot'      => array_merge( $common, $tablealign ),
1125                         'tbody'      => array_merge( $common, $tablealign ),
1126
1127                         # 11.2.4
1128                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1129                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1130
1131                         # 11.2.5
1132                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
1133
1134                         # 11.2.6
1135                         'td'         => array_merge( $common, $tablecell, $tablealign ),
1136                         'th'         => array_merge( $common, $tablecell, $tablealign ),
1137
1138                         # 15.2.1
1139                         'tt'         => $common,
1140                         'b'          => $common,
1141                         'i'          => $common,
1142                         'big'        => $common,
1143                         'small'      => $common,
1144                         'strike'     => $common,
1145                         's'          => $common,
1146                         'u'          => $common,
1147
1148                         # 15.2.2
1149                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
1150                         # basefont
1151
1152                         # 15.3
1153                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
1154
1155                         # XHTML Ruby annotation text module, simple ruby only.
1156                         # http://www.w3c.org/TR/ruby/
1157                         'ruby'       => $common,
1158                         # rbc
1159                         # rtc
1160                         'rb'         => $common,
1161                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
1162                         'rp'         => $common,
1163                         );
1164                 return $whitelist;
1165         }
1166
1167         /**
1168          * Take a fragment of (potentially invalid) HTML and return
1169          * a version with any tags removed, encoded suitably for literal
1170          * inclusion in an attribute value.
1171          *
1172          * @param string $text HTML fragment
1173          * @return string
1174          */
1175         static function stripAllTags( $text ) {
1176                 # Actual <tags>
1177                 $text = StringUtils::delimiterReplace( '<', '>', '', $text );
1178
1179                 # Normalize &entities and whitespace
1180                 $text = Sanitizer::normalizeAttributeValue( $text );
1181
1182                 # Will be placed into "double-quoted" attributes,
1183                 # make sure remaining bits are safe.
1184                 $text = str_replace(
1185                         array('<', '>', '"'),
1186                         array('&lt;', '&gt;', '&quot;'),
1187                         $text );
1188
1189                 return $text;
1190         }
1191
1192         /**
1193          * Hack up a private DOCTYPE with HTML's standard entity declarations.
1194          * PHP 4 seemed to know these if you gave it an HTML doctype, but
1195          * PHP 5.1 doesn't.
1196          *
1197          * Use for passing XHTML fragments to PHP's XML parsing functions
1198          *
1199          * @return string
1200          * @static
1201          */
1202         static function hackDocType() {
1203                 global $wgHtmlEntities;
1204                 $out = "<!DOCTYPE html [\n";
1205                 foreach( $wgHtmlEntities as $entity => $codepoint ) {
1206                         $out .= "<!ENTITY $entity \"&#$codepoint;\">";
1207                 }
1208                 $out .= "]>\n";
1209                 return $out;
1210         }
1211
1212         static function cleanUrl( $url, $hostname=true ) {
1213                 # Normalize any HTML entities in input. They will be
1214                 # re-escaped by makeExternalLink().
1215                 $url = Sanitizer::decodeCharReferences( $url );
1216
1217                 # Escape any control characters introduced by the above step
1218                 $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
1219
1220                 # Validate hostname portion
1221                 $matches = array();
1222                 if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
1223                         list( /* $whole */, $protocol, $host, $rest ) = $matches;
1224
1225                         // Characters that will be ignored in IDNs.
1226                         // http://tools.ietf.org/html/3454#section-3.1
1227                         // Strip them before further processing so blacklists and such work.
1228                         $strip = "/
1229                                 \\s|          # general whitespace
1230                                 \xc2\xad|     # 00ad SOFT HYPHEN
1231                                 \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
1232                                 \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
1233                                 \xe2\x81\xa0| # 2060 WORD JOINER
1234                                 \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
1235                                 \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
1236                                 \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
1237                                 \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
1238                                 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
1239                                 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
1240                                 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
1241                                 [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
1242                                 /xuD";
1243
1244                         $host = preg_replace( $strip, '', $host );
1245
1246                         // @fixme: validate hostnames here
1247
1248                         return $protocol . $host . $rest;
1249                 } else {
1250                         return $url;
1251                 }
1252         }
1253
1254 }
1255
1256 ?>