includes/Sanitizer.php

   1 <?php
   2 /**
   3  * XHTML sanitizer for MediaWiki
   4  *
   5  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * http://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @package MediaWiki
  24  * @subpackage Parser
  25  */
  26
/**
 * Regular expression to match various types of character references in
 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
 *
 * Each alternative captures into its own group:
 *   1 - the name of a named reference ("amp" in "&amp;")
 *   2 - the digits of a decimal reference ("160" in "&#160;")
 *   3 - the digits of a hexadecimal reference written with a lowercase "x"
 *   4 - the digits of a hexadecimal reference written with an uppercase "X"
 *   5 - a bare "&" that does not begin any of the forms above
 *
 * Groups 3 and 4 accept any ASCII letter or digit, not only 0-9A-F, so a
 * malformed reference such as "&#xZZ;" is still matched as a whole;
 * presumably the callers are expected to validate the digits -- confirm
 * against the callbacks that consume this pattern.
 *
 * The pattern is written with the /x modifier, so the newlines and
 * indentation inside the string literal are not part of what is matched.
 */
define( 'MW_CHAR_REFS_REGEX',
        '/&([A-Za-z0-9]+);
         |&\#([0-9]+);
         |&\#x([0-9A-Za-z]+);
         |&\#X([0-9A-Za-z]+);
         |(&)/x' );
  37
/**
 * Regular expression to match HTML/XML attribute pairs within a tag.
 * Allows some... latitude.
 * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
 *
 * Capture groups (this is the layout Sanitizer::getTagAttributeCallback
 * relies on):
 *   1 - attribute name (ASCII letters and digits only)
 *   2 - the whole "= value" part; absent for a bare attribute name
 *   3 - value given in double quotes (may not contain '<' or '"')
 *   4 - value given in single quotes (may not contain '<' or "'")
 *   5 - unquoted value
 *   6 - unquoted "#" followed by hex digits (colour given without quotes)
 *
 * A pair must be preceded by the start of the string or whitespace and
 * followed by whitespace or the end of the string.
 *
 * $attrib and $space are plain file-scope variables that are interpolated
 * into the double-quoted pattern below; they remain defined after this
 * statement.
 */
$attrib = '[A-Za-z0-9]';
$space = '[\x09\x0a\x0d\x20]';
define( 'MW_ATTRIBS_REGEX',
        "/(?:^|$space)($attrib+)
          ($space*=$space*
                (?:
                 # The attribute value: quoted or alone
                  \"([^<\"]*)\"
                 | '([^<']*)'
                 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
                 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                                                         # colors are specified like this.
                                                         # We'll be normalizing it.
                )
           )?(?=$space|\$)/sx" );
  58
/**
 * List of all named character entities defined in HTML 4.01
 * http://www.w3.org/TR/html4/sgml/entities.html
 *
 * Maps the case-sensitive entity name (without the leading "&" and the
 * trailing ";") to its Unicode code point as a decimal integer, e.g.
 * 'amp' => 38 and 'nbsp' => 160. Names that differ only in case, such as
 * 'dagger' / 'Dagger' or 'prime' / 'Prime', are distinct entries.
 *
 * NOTE(review): the explicit "global" declaration presumably keeps the
 * table in the global scope even when this file is included from inside
 * a function -- confirm against how the file is loaded.
 * @private
 */
global $wgHtmlEntities;
$wgHtmlEntities = array(
        'Aacute'   => 193,
        'aacute'   => 225,
        'Acirc'    => 194,
        'acirc'    => 226,
        'acute'    => 180,
        'AElig'    => 198,
        'aelig'    => 230,
        'Agrave'   => 192,
        'agrave'   => 224,
        'alefsym'  => 8501,
        'Alpha'    => 913,
        'alpha'    => 945,
        'amp'      => 38,
        'and'      => 8743,
        'ang'      => 8736,
        'Aring'    => 197,
        'aring'    => 229,
        'asymp'    => 8776,
        'Atilde'   => 195,
        'atilde'   => 227,
        'Auml'     => 196,
        'auml'     => 228,
        'bdquo'    => 8222,
        'Beta'     => 914,
        'beta'     => 946,
        'brvbar'   => 166,
        'bull'     => 8226,
        'cap'      => 8745,
        'Ccedil'   => 199,
        'ccedil'   => 231,
        'cedil'    => 184,
        'cent'     => 162,
        'Chi'      => 935,
        'chi'      => 967,
        'circ'     => 710,
        'clubs'    => 9827,
        'cong'     => 8773,
        'copy'     => 169,
        'crarr'    => 8629,
        'cup'      => 8746,
        'curren'   => 164,
        'dagger'   => 8224,
        'Dagger'   => 8225,
        'darr'     => 8595,
        'dArr'     => 8659,
        'deg'      => 176,
        'Delta'    => 916,
        'delta'    => 948,
        'diams'    => 9830,
        'divide'   => 247,
        'Eacute'   => 201,
        'eacute'   => 233,
        'Ecirc'    => 202,
        'ecirc'    => 234,
        'Egrave'   => 200,
        'egrave'   => 232,
        'empty'    => 8709,
        'emsp'     => 8195,
        'ensp'     => 8194,
        'Epsilon'  => 917,
        'epsilon'  => 949,
        'equiv'    => 8801,
        'Eta'      => 919,
        'eta'      => 951,
        'ETH'      => 208,
        'eth'      => 240,
        'Euml'     => 203,
        'euml'     => 235,
        'euro'     => 8364,
        'exist'    => 8707,
        'fnof'     => 402,
        'forall'   => 8704,
        'frac12'   => 189,
        'frac14'   => 188,
        'frac34'   => 190,
        'frasl'    => 8260,
        'Gamma'    => 915,
        'gamma'    => 947,
        'ge'       => 8805,
        'gt'       => 62,
        'harr'     => 8596,
        'hArr'     => 8660,
        'hearts'   => 9829,
        'hellip'   => 8230,
        'Iacute'   => 205,
        'iacute'   => 237,
        'Icirc'    => 206,
        'icirc'    => 238,
        'iexcl'    => 161,
        'Igrave'   => 204,
        'igrave'   => 236,
        'image'    => 8465,
        'infin'    => 8734,
        'int'      => 8747,
        'Iota'     => 921,
        'iota'     => 953,
        'iquest'   => 191,
        'isin'     => 8712,
        'Iuml'     => 207,
        'iuml'     => 239,
        'Kappa'    => 922,
        'kappa'    => 954,
        'Lambda'   => 923,
        'lambda'   => 955,
        'lang'     => 9001,
        'laquo'    => 171,
        'larr'     => 8592,
        'lArr'     => 8656,
        'lceil'    => 8968,
        'ldquo'    => 8220,
        'le'       => 8804,
        'lfloor'   => 8970,
        'lowast'   => 8727,
        'loz'      => 9674,
        'lrm'      => 8206,
        'lsaquo'   => 8249,
        'lsquo'    => 8216,
        'lt'       => 60,
        'macr'     => 175,
        'mdash'    => 8212,
        'micro'    => 181,
        'middot'   => 183,
        'minus'    => 8722,
        'Mu'       => 924,
        'mu'       => 956,
        'nabla'    => 8711,
        'nbsp'     => 160,
        'ndash'    => 8211,
        'ne'       => 8800,
        'ni'       => 8715,
        'not'      => 172,
        'notin'    => 8713,
        'nsub'     => 8836,
        'Ntilde'   => 209,
        'ntilde'   => 241,
        'Nu'       => 925,
        'nu'       => 957,
        'Oacute'   => 211,
        'oacute'   => 243,
        'Ocirc'    => 212,
        'ocirc'    => 244,
        'OElig'    => 338,
        'oelig'    => 339,
        'Ograve'   => 210,
        'ograve'   => 242,
        'oline'    => 8254,
        'Omega'    => 937,
        'omega'    => 969,
        'Omicron'  => 927,
        'omicron'  => 959,
        'oplus'    => 8853,
        'or'       => 8744,
        'ordf'     => 170,
        'ordm'     => 186,
        'Oslash'   => 216,
        'oslash'   => 248,
        'Otilde'   => 213,
        'otilde'   => 245,
        'otimes'   => 8855,
        'Ouml'     => 214,
        'ouml'     => 246,
        'para'     => 182,
        'part'     => 8706,
        'permil'   => 8240,
        'perp'     => 8869,
        'Phi'      => 934,
        'phi'      => 966,
        'Pi'       => 928,
        'pi'       => 960,
        'piv'      => 982,
        'plusmn'   => 177,
        'pound'    => 163,
        'prime'    => 8242,
        'Prime'    => 8243,
        'prod'     => 8719,
        'prop'     => 8733,
        'Psi'      => 936,
        'psi'      => 968,
        'quot'     => 34,
        'radic'    => 8730,
        'rang'     => 9002,
        'raquo'    => 187,
        'rarr'     => 8594,
        'rArr'     => 8658,
        'rceil'    => 8969,
        'rdquo'    => 8221,
        'real'     => 8476,
        'reg'      => 174,
        'rfloor'   => 8971,
        'Rho'      => 929,
        'rho'      => 961,
        'rlm'      => 8207,
        'rsaquo'   => 8250,
        'rsquo'    => 8217,
        'sbquo'    => 8218,
        'Scaron'   => 352,
        'scaron'   => 353,
        'sdot'     => 8901,
        'sect'     => 167,
        'shy'      => 173,
        'Sigma'    => 931,
        'sigma'    => 963,
        'sigmaf'   => 962,
        'sim'      => 8764,
        'spades'   => 9824,
        'sub'      => 8834,
        'sube'     => 8838,
        'sum'      => 8721,
        'sup'      => 8835,
        'sup1'     => 185,
        'sup2'     => 178,
        'sup3'     => 179,
        'supe'     => 8839,
        'szlig'    => 223,
        'Tau'      => 932,
        'tau'      => 964,
        'there4'   => 8756,
        'Theta'    => 920,
        'theta'    => 952,
        'thetasym' => 977,
        'thinsp'   => 8201,
        'THORN'    => 222,
        'thorn'    => 254,
        'tilde'    => 732,
        'times'    => 215,
        'trade'    => 8482,
        'Uacute'   => 218,
        'uacute'   => 250,
        'uarr'     => 8593,
        'uArr'     => 8657,
        'Ucirc'    => 219,
        'ucirc'    => 251,
        'Ugrave'   => 217,
        'ugrave'   => 249,
        'uml'      => 168,
        'upsih'    => 978,
        'Upsilon'  => 933,
        'upsilon'  => 965,
        'Uuml'     => 220,
        'uuml'     => 252,
        'weierp'   => 8472,
        'Xi'       => 926,
        'xi'       => 958,
        'Yacute'   => 221,
        'yacute'   => 253,
        'yen'      => 165,
        'Yuml'     => 376,
        'yuml'     => 255,
        'Zeta'     => 918,
        'zeta'     => 950,
        'zwj'      => 8205,
        'zwnj'     => 8204 );
 318
 319 /** @package MediaWiki */
 320 class Sanitizer {
 321         /**
 322          * Cleans up HTML, removes dangerous tags and attributes, and
 323          * removes HTML comments
 324          * @private
 325          * @param string $text
 326          * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
 327          * @param array $args for the processing callback
 328          * @return string
 329          */
 330         function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
 331                 global $wgUseTidy, $wgUserHtml;
 332                 $fname = 'Parser::removeHTMLtags';
 333                 wfProfileIn( $fname );
 334
 335                 if( $wgUserHtml ) {
 336                         $htmlpairs = array( # Tags that must be closed
 337                                 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 338                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 339                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
 340                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 341                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
 342                         );
 343                         $htmlsingle = array(
 344                                 'br', 'hr', 'li', 'dt', 'dd'
 345                         );
 346                         $htmlsingleonly = array( # Elements that cannot have close tags
 347                                 'br', 'hr'
 348                         );
 349                         $htmlnest = array( # Tags that can be nested--??
 350                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 351                                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
 352                         );
 353                         $tabletags = array( # Can only appear inside table
 354                                 'td', 'th', 'tr',
 355                         );
 356                         $htmllist = array( # Tags used by list
 357                                 'ul','ol',
 358                         );
 359                         $listtags = array( # Tags that can appear in a list
 360                                 'li',
 361                         );
 362
 363                 } else {
 364                         $htmlpairs = array();
 365                         $htmlsingle = array();
 366                         $htmlnest = array();
 367                         $tabletags = array();
 368                 }
 369
 370                 $htmlsingle = array_merge( $tabletags, $htmlsingle );
 371                 $htmlelements = array_merge( $htmlsingle, $htmlpairs );
 372
 373                 # Remove HTML comments
 374                 $text = Sanitizer::removeHTMLcomments( $text );
 375                 $bits = explode( '<', $text );
 376                 $text = array_shift( $bits );
 377                 if(!$wgUseTidy) {
 378                         $tagstack = array(); $tablestack = array();
 379                         foreach ( $bits as $x ) {
 380                                 $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
 381                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 382                                 $x, $regs );
 383                                 list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 384                                 error_reporting( $prev );
 385
 386                                 $badtag = 0 ;
 387                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 388                                         # Check our stack
 389                                         if ( $slash ) {
 390                                                 # Closing a tag...
 391                                                 if( in_array( $t, $htmlsingleonly ) ) {
 392                                                         $badtag = 1;
 393                                                 } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
 394                                                         @array_push( $tagstack, $ot );
 395                                                         # <li> can be nested in <ul> or <ol>, skip those cases:
 396                                                         if(!(in_array($ot, $htmllist) && in_array($t, $listtags) )) {
 397                                                                 $badtag = 1;
 398                                                         }
 399                                                 } else {
 400                                                         if ( $t == 'table' ) {
 401                                                                 $tagstack = array_pop( $tablestack );
 402                                                         }
 403                                                         $newparams = '';
 404                                                 }
 405                                         } else {
 406                                                 # Keep track for later
 407                                                 if ( in_array( $t, $tabletags ) &&
 408                                                 ! in_array( 'table', $tagstack ) ) {
 409                                                         $badtag = 1;
 410                                                 } else if ( in_array( $t, $tagstack ) &&
 411                                                 ! in_array ( $t , $htmlnest ) ) {
 412                                                         $badtag = 1 ;
 413                                                 # Is it a self closed htmlpair ? (bug 5487)
 414                                                 } else if( $brace == '/>' &&
 415                                                 in_array($t, $htmlpairs) ) {
 416                                                         $badtag = 1;
 417                                                 } elseif( in_array( $t, $htmlsingleonly ) ) {
 418                                                         # Hack to force empty tag for uncloseable elements
 419                                                         $brace = '/>';
 420                                                 } else if( in_array( $t, $htmlsingle ) ) {
 421                                                         # Hack to not close $htmlsingle tags
 422                                                         $brace = NULL;
 423                                                 } else {
 424                                                         if ( $t == 'table' ) {
 425                                                                 array_push( $tablestack, $tagstack );
 426                                                                 $tagstack = array();
 427                                                         }
 428                                                         array_push( $tagstack, $t );
 429                                                 }
 430
 431                                                 # Replace any variables or template parameters with
 432                                                 # plaintext results.
 433                                                 if( is_callable( $processCallback ) ) {
 434                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
 435                                                 }
 436
 437                                                 # Strip non-approved attributes from the tag
 438                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 439                                         }
 440                                         if ( ! $badtag ) {
 441                                                 $rest = str_replace( '>', '&gt;', $rest );
 442                                                 $close = ( $brace == '/>' ) ? ' /' : '';
 443                                                 $text .= "<$slash$t$newparams$close>$rest";
 444                                                 continue;
 445                                         }
 446                                 }
 447                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 448                         }
 449                         # Close off any remaining tags
 450                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 451                                 $text .= "</$t>\n";
 452                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 453                         }
 454                 } else {
 455                         # this might be possible using tidy itself
 456                         foreach ( $bits as $x ) {
 457                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 458                                 $x, $regs );
 459                                 @list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 460                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 461                                         if( is_callable( $processCallback ) ) {
 462                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
 463                                         }
 464                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 465                                         $rest = str_replace( '>', '&gt;', $rest );
 466                                         $text .= "<$slash$t$newparams$brace$rest";
 467                                 } else {
 468                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 469                                 }
 470                         }
 471                 }
 472                 wfProfileOut( $fname );
 473                 return $text;
 474         }
 475
 476         /**
 477          * Remove '<!--', '-->', and everything between.
 478          * To avoid leaving blank lines, when a comment is both preceded
 479          * and followed by a newline (ignoring spaces), trim leading and
 480          * trailing spaces and one of the newlines.
 481          *
 482          * @private
 483          * @param string $text
 484          * @return string
 485          */
 486         function removeHTMLcomments( $text ) {
 487                 $fname='Parser::removeHTMLcomments';
 488                 wfProfileIn( $fname );
 489                 while (($start = strpos($text, '<!--')) !== false) {
 490                         $end = strpos($text, '-->', $start + 4);
 491                         if ($end === false) {
 492                                 # Unterminated comment; bail out
 493                                 break;
 494                         }
 495
 496                         $end += 3;
 497
 498                         # Trim space and newline if the comment is both
 499                         # preceded and followed by a newline
 500                         $spaceStart = max($start - 1, 0);
 501                         $spaceLen = $end - $spaceStart;
 502                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 503                                 $spaceStart--;
 504                                 $spaceLen++;
 505                         }
 506                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 507                                 $spaceLen++;
 508                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 509                                 # Remove the comment, leading and trailing
 510                                 # spaces, and leave only one newline.
 511                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 512                         }
 513                         else {
 514                                 # Remove just the comment.
 515                                 $text = substr_replace($text, '', $start, $end - $start);
 516                         }
 517                 }
 518                 wfProfileOut( $fname );
 519                 return $text;
 520         }
 521
 522         /**
 523          * Take a tag soup fragment listing an HTML element's attributes
 524          * and normalize it to well-formed XML, discarding unwanted attributes.
 525          *
 526          * - Normalizes attribute names to lowercase
 527          * - Discards attributes not on a whitelist for the given element
 528          * - Turns broken or invalid entities into plaintext
 529          * - Double-quotes all attribute values
 530          * - Attributes without values are given the name as attribute
 531          * - Double attributes are discarded
 532          * - Unsafe style attributes are discarded
 533          * - Prepends space if there are attributes.
 534          *
 535          * @param string $text
 536          * @param string $element
 537          * @return string
 538          *
 539          * @todo Check for legal values where the DTD limits things.
 540          * @todo Check for unique id attribute :P
 541          */
 542         function fixTagAttributes( $text, $element ) {
 543                 if( trim( $text ) == '' ) {
 544                         return '';
 545                 }
 546
 547                 # Unquoted attribute
 548                 # Since we quote this later, this can be anything distinguishable
 549                 # from the end of the attribute
 550                 $pairs = array();
 551                 if( !preg_match_all(
 552                         MW_ATTRIBS_REGEX,
 553                         $text,
 554                         $pairs,
 555                         PREG_SET_ORDER ) ) {
 556                         return '';
 557                 }
 558
 559                 $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
 560                 $attribs = array();
 561                 foreach( $pairs as $set ) {
 562                         $attribute = strtolower( $set[1] );
 563                         if( !isset( $whitelist[$attribute] ) ) {
 564                                 continue;
 565                         }
 566
 567                         $raw   = Sanitizer::getTagAttributeCallback( $set );
 568                         $value = Sanitizer::normalizeAttributeValue( $raw );
 569
 570                         # Strip javascript "expression" from stylesheets.
 571                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 572                         if( $attribute == 'style' ) {
 573                                 $stripped = Sanitizer::decodeCharReferences( $value );
 574
 575                                 // Remove any comments; IE gets token splitting wrong
 576                                 $stripped = preg_replace( '!/\\*.*?\\*/!S', ' ', $stripped );
 577                                 $value = htmlspecialchars( $stripped );
 578
 579                                 // ... and continue checks
 580                                 $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
 581                                         'codepointToUtf8(hexdec("$1"))', $stripped );
 582                                 $stripped = str_replace( '\\', '', $stripped );
 583                                 if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
 584                                                 $stripped ) ) {
 585                                         # haxx0r
 586                                         continue;
 587                                 }
 588                         }
 589
 590                         if ( $attribute === 'id' )
 591                                 $value = Sanitizer::escapeId( $value );
 592
 593                         # Templates and links may be expanded in later parsing,
 594                         # creating invalid or dangerous output. Suppress this.
 595                         $value = strtr( $value, array(
 596                                 '<'    => '&lt;',   // This should never happen,
 597                                 '>'    => '&gt;',   // we've received invalid input
 598                                 '"'    => '&quot;', // which should have been escaped.
 599                                 '{'    => '&#123;',
 600                                 '['    => '&#91;',
 601                                 "''"   => '&#39;&#39;',
 602                                 'ISBN' => '&#73;SBN',
 603                                 'RFC'  => '&#82;FC',
 604                                 'PMID' => '&#80;MID',
 605                                 '|'    => '&#124;',
 606                         ) );
 607
 608                         # Stupid hack
 609                         $value = preg_replace_callback(
 610                                 '/(' . wfUrlProtocols() . ')/',
 611                                 array( 'Sanitizer', 'armorLinksCallback' ),
 612                                 $value );
 613
 614                         // If this attribute was previously set, override it.
 615                         // Output should only have one attribute of each name.
 616                         $attribs[$attribute] = "$attribute=\"$value\"";
 617                 }
 618
 619                 return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
 620         }
 621
 622         /**
 623          * Given a value escape it so that it can be used in an id attribute and
 624          * return it, this does not validate the value however (see first link)
 625          *
 626          * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
 627          *                                                          in the id and
 628          *                                                          name attributes
 629          * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 630          *
 631          * @bug 4461
 632          *
 633          * @static
 634          *
 635          * @param string $id
 636          * @return string
 637          */
 638         function escapeId( $id ) {
 639                 static $replace = array(
 640                         '%3A' => ':',
 641                         '%' => '.'
 642                 );
 643
 644                 $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
 645
 646                 return str_replace( array_keys( $replace ), array_values( $replace ), $id );
 647         }
 648
 649         /**
 650          * Regex replace callback for armoring links against further processing.
 651          * @param array $matches
 652          * @return string
 653          * @private
 654          */
 655         function armorLinksCallback( $matches ) {
 656                 return str_replace( ':', '&#58;', $matches[1] );
 657         }
 658
 659         /**
 660          * Return an associative array of attribute names and values from
 661          * a partial tag string. Attribute names are forces to lowercase,
 662          * character references are decoded to UTF-8 text.
 663          *
 664          * @param string
 665          * @return array
 666          */
 667         function decodeTagAttributes( $text ) {
 668                 $attribs = array();
 669
 670                 if( trim( $text ) == '' ) {
 671                         return $attribs;
 672                 }
 673
 674                 $pairs = array();
 675                 if( !preg_match_all(
 676                         MW_ATTRIBS_REGEX,
 677                         $text,
 678                         $pairs,
 679                         PREG_SET_ORDER ) ) {
 680                         return $attribs;
 681                 }
 682
 683                 foreach( $pairs as $set ) {
 684                         $attribute = strtolower( $set[1] );
 685                         $value = Sanitizer::getTagAttributeCallback( $set );
 686                         $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
 687                 }
 688                 return $attribs;
 689         }
 690
 691         /**
 692          * Pick the appropriate attribute value from a match set from the
 693          * MW_ATTRIBS_REGEX matches.
 694          *
 695          * @param array $set
 696          * @return string
 697          * @private
 698          */
 699         function getTagAttributeCallback( $set ) {
 700                 if( isset( $set[6] ) ) {
 701                         # Illegal #XXXXXX color with no quotes.
 702                         return $set[6];
 703                 } elseif( isset( $set[5] ) ) {
 704                         # No quotes.
 705                         return $set[5];
 706                 } elseif( isset( $set[4] ) ) {
 707                         # Single-quoted
 708                         return $set[4];
 709                 } elseif( isset( $set[3] ) ) {
 710                         # Double-quoted
 711                         return $set[3];
 712                 } elseif( !isset( $set[2] ) ) {
 713                         # In XHTML, attributes must have a value.
 714                         # For 'reduced' form, return explicitly the attribute name here.
 715                         return $set[1];
 716                 } else {
 717                         wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
 718                 }
 719         }
 720
 721         /**
 722          * Normalize whitespace and character references in an XML source-
 723          * encoded text for an attribute value.
 724          *
 725          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 726          * but note that we're not returning the value, but are returning
 727          * XML source fragments that will be slapped into output.
 728          *
 729          * @param string $text
 730          * @return string
 731          * @private
 732          */
 733         function normalizeAttributeValue( $text ) {
 734                 return str_replace( '"', '&quot;',
 735                         preg_replace(
 736                                 '/\r\n|[\x20\x0d\x0a\x09]/',
 737                                 ' ',
 738                                 Sanitizer::normalizeCharReferences( $text ) ) );
 739         }
 740
 741         /**
 742          * Ensure that any entities and character references are legal
 743          * for XML and XHTML specifically. Any stray bits will be
 744          * &amp;-escaped to result in a valid text fragment.
 745          *
 746          * a. any named char refs must be known in XHTML
 747          * b. any numeric char refs must be legal chars, not invalid or forbidden
 748          * c. use &#x, not &#X
 749          * d. fix or reject non-valid attributes
 750          *
 751          * @param string $text
 752          * @return string
 753          * @private
 754          */
 755         function normalizeCharReferences( $text ) {
 756                 return preg_replace_callback(
 757                         MW_CHAR_REFS_REGEX,
 758                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
 759                         $text );
 760         }
 761         /**
         * @param array $matches
 763          * @return string
 764          */
 765         function normalizeCharReferencesCallback( $matches ) {
 766                 $ret = null;
 767                 if( $matches[1] != '' ) {
 768                         $ret = Sanitizer::normalizeEntity( $matches[1] );
 769                 } elseif( $matches[2] != '' ) {
 770                         $ret = Sanitizer::decCharReference( $matches[2] );
 771                 } elseif( $matches[3] != ''  ) {
 772                         $ret = Sanitizer::hexCharReference( $matches[3] );
 773                 } elseif( $matches[4] != '' ) {
 774                         $ret = Sanitizer::hexCharReference( $matches[4] );
 775                 }
 776                 if( is_null( $ret ) ) {
 777                         return htmlspecialchars( $matches[0] );
 778                 } else {
 779                         return $ret;
 780                 }
 781         }
 782
 783         /**
 784          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 785          * return the named entity reference as is. Otherwise, returns
 786          * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
 787          *
 788          * @param string $name
 789          * @return string
 790          */
 791         function normalizeEntity( $name ) {
 792                 global $wgHtmlEntities;
 793                 if( isset( $wgHtmlEntities[$name] ) ) {
 794                         return "&$name;";
 795                 } else {
 796                         return "&amp;$name;";
 797                 }
 798         }
 799
 800         function decCharReference( $codepoint ) {
 801                 $point = intval( $codepoint );
 802                 if( Sanitizer::validateCodepoint( $point ) ) {
 803                         return sprintf( '&#%d;', $point );
 804                 } else {
 805                         return null;
 806                 }
 807         }
 808
 809         function hexCharReference( $codepoint ) {
 810                 $point = hexdec( $codepoint );
 811                 if( Sanitizer::validateCodepoint( $point ) ) {
 812                         return sprintf( '&#x%x;', $point );
 813                 } else {
 814                         return null;
 815                 }
 816         }
 817
 818         /**
 819          * Returns true if a given Unicode codepoint is a valid character in XML.
 820          * @param int $codepoint
 821          * @return bool
 822          */
 823         function validateCodepoint( $codepoint ) {
 824                 return ($codepoint ==    0x09)
 825                         || ($codepoint ==    0x0a)
 826                         || ($codepoint ==    0x0d)
 827                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
 828                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
 829                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
 830         }
 831
 832         /**
 833          * Decode any character references, numeric or named entities,
 834          * in the text and return a UTF-8 string.
 835          *
 836          * @param string $text
 837          * @return string
 838          * @public
 839          */
 840         function decodeCharReferences( $text ) {
 841                 return preg_replace_callback(
 842                         MW_CHAR_REFS_REGEX,
 843                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
 844                         $text );
 845         }
 846
 847         /**
         * @param array $matches
 849          * @return string
 850          */
 851         function decodeCharReferencesCallback( $matches ) {
 852                 if( $matches[1] != '' ) {
 853                         return Sanitizer::decodeEntity( $matches[1] );
 854                 } elseif( $matches[2] != '' ) {
 855                         return  Sanitizer::decodeChar( intval( $matches[2] ) );
 856                 } elseif( $matches[3] != ''  ) {
 857                         return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
 858                 } elseif( $matches[4] != '' ) {
 859                         return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
 860                 }
 861                 # Last case should be an ampersand by itself
 862                 return $matches[0];
 863         }
 864
 865         /**
 866          * Return UTF-8 string for a codepoint if that is a valid
 867          * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 868          * @param int $codepoint
 869          * @return string
 870          * @private
 871          */
 872         function decodeChar( $codepoint ) {
 873                 if( Sanitizer::validateCodepoint( $codepoint ) ) {
 874                         return codepointToUtf8( $codepoint );
 875                 } else {
 876                         return UTF8_REPLACEMENT;
 877                 }
 878         }
 879
 880         /**
 881          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 882          * return the UTF-8 encoding of that character. Otherwise, returns
 883          * pseudo-entity source (eg &foo;)
 884          *
 885          * @param string $name
 886          * @return string
 887          */
 888         function decodeEntity( $name ) {
 889                 global $wgHtmlEntities;
 890                 if( isset( $wgHtmlEntities[$name] ) ) {
 891                         return codepointToUtf8( $wgHtmlEntities[$name] );
 892                 } else {
 893                         return "&$name;";
 894                 }
 895         }
 896
 897         /**
 898          * Fetch the whitelist of acceptable attributes for a given
 899          * element name.
 900          *
 901          * @param string $element
 902          * @return array
 903          */
 904         function attributeWhitelist( $element ) {
 905                 static $list;
 906                 if( !isset( $list ) ) {
 907                         $list = Sanitizer::setupAttributeWhitelist();
 908                 }
 909                 return isset( $list[$element] )
 910                         ? $list[$element]
 911                         : array();
 912         }
 913
 914         /**
 915          * @return array
 916          */
 917         function setupAttributeWhitelist() {
 918                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
 919                 $block = array_merge( $common, array( 'align' ) );
 920                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
 921                 $tablecell = array( 'abbr',
 922                                     'axis',
 923                                     'headers',
 924                                     'scope',
 925                                     'rowspan',
 926                                     'colspan',
 927                                     'nowrap', # deprecated
 928                                     'width',  # deprecated
 929                                     'height', # deprecated
 930                                     'bgcolor' # deprecated
 931                                     );
 932
 933                 # Numbers refer to sections in HTML 4.01 standard describing the element.
 934                 # See: http://www.w3.org/TR/html4/
 935                 $whitelist = array (
 936                         # 7.5.4
 937                         'div'        => $block,
 938                         'center'     => $common, # deprecated
 939                         'span'       => $block, # ??
 940
 941                         # 7.5.5
 942                         'h1'         => $block,
 943                         'h2'         => $block,
 944                         'h3'         => $block,
 945                         'h4'         => $block,
 946                         'h5'         => $block,
 947                         'h6'         => $block,
 948
 949                         # 7.5.6
 950                         # address
 951
 952                         # 8.2.4
 953                         # bdo
 954
 955                         # 9.2.1
 956                         'em'         => $common,
 957                         'strong'     => $common,
 958                         'cite'       => $common,
 959                         # dfn
 960                         'code'       => $common,
 961                         # samp
 962                         # kbd
 963                         'var'        => $common,
 964                         # abbr
 965                         # acronym
 966
 967                         # 9.2.2
 968                         'blockquote' => array_merge( $common, array( 'cite' ) ),
 969                         # q
 970
 971                         # 9.2.3
 972                         'sub'        => $common,
 973                         'sup'        => $common,
 974
 975                         # 9.3.1
 976                         'p'          => $block,
 977
 978                         # 9.3.2
 979                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
 980
 981                         # 9.3.4
 982                         'pre'        => array_merge( $common, array( 'width' ) ),
 983
 984                         # 9.4
 985                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 986                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 987
 988                         # 10.2
 989                         'ul'         => array_merge( $common, array( 'type' ) ),
 990                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
 991                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
 992
 993                         # 10.3
 994                         'dl'         => $common,
 995                         'dd'         => $common,
 996                         'dt'         => $common,
 997
 998                         # 11.2.1
 999                         'table'      => array_merge( $common,
1000                                                                 array( 'summary', 'width', 'border', 'frame',
1001                                                                                          'rules', 'cellspacing', 'cellpadding',
1002                                                                                          'align', 'bgcolor', 'frame', 'rules',
1003                                                                                          'border' ) ),
1004
1005                         # 11.2.2
1006                         'caption'    => array_merge( $common, array( 'align' ) ),
1007
1008                         # 11.2.3
1009                         'thead'      => array_merge( $common, $tablealign ),
1010                         'tfoot'      => array_merge( $common, $tablealign ),
1011                         'tbody'      => array_merge( $common, $tablealign ),
1012
1013                         # 11.2.4
1014                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1015                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1016
1017                         # 11.2.5
1018                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
1019
1020                         # 11.2.6
1021                         'td'         => array_merge( $common, $tablecell, $tablealign ),
1022                         'th'         => array_merge( $common, $tablecell, $tablealign ),
1023
1024                         # 15.2.1
1025                         'tt'         => $common,
1026                         'b'          => $common,
1027                         'i'          => $common,
1028                         'big'        => $common,
1029                         'small'      => $common,
1030                         'strike'     => $common,
1031                         's'          => $common,
1032                         'u'          => $common,
1033
1034                         # 15.2.2
1035                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
1036                         # basefont
1037
1038                         # 15.3
1039                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
1040
1041                         # XHTML Ruby annotation text module, simple ruby only.
1042                         # http://www.w3c.org/TR/ruby/
1043                         'ruby'       => $common,
1044                         # rbc
1045                         # rtc
1046                         'rb'         => $common,
1047                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
1048                         'rp'         => $common,
1049                         );
1050                 return $whitelist;
1051         }
1052
1053         /**
1054          * Take a fragment of (potentially invalid) HTML and return
1055          * a version with any tags removed, encoded suitably for literal
1056          * inclusion in an attribute value.
1057          *
1058          * @param string $text HTML fragment
1059          * @return string
1060          */
1061         function stripAllTags( $text ) {
1062                 # Actual <tags>
1063                 $text = preg_replace( '/ < .*? > /x', '', $text );
1064
1065                 # Normalize &entities and whitespace
1066                 $text = Sanitizer::normalizeAttributeValue( $text );
1067
1068                 # Will be placed into "double-quoted" attributes,
1069                 # make sure remaining bits are safe.
1070                 $text = str_replace(
1071                         array('<', '>', '"'),
1072                         array('&lt;', '&gt;', '&quot;'),
1073                         $text );
1074
1075                 return $text;
1076         }
1077
1078         /**
1079          * Hack up a private DOCTYPE with HTML's standard entity declarations.
1080          * PHP 4 seemed to know these if you gave it an HTML doctype, but
1081          * PHP 5.1 doesn't.
1082          *
1083          * Use for passing XHTML fragments to PHP's XML parsing functions
1084          *
1085          * @return string
1086          * @static
1087          */
1088         function hackDocType() {
1089                 global $wgHtmlEntities;
1090                 $out = "<!DOCTYPE html [\n";
1091                 foreach( $wgHtmlEntities as $entity => $codepoint ) {
1092                         $out .= "<!ENTITY $entity \"&#$codepoint;\">";
1093                 }
1094                 $out .= "]>\n";
1095                 return $out;
1096         }
1097
1098 }
1099
1100 ?>