includes/Sanitizer.php

   1 <?php
   2 /**
   3  * XHTML sanitizer for MediaWiki
   4  *
   5  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * http://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @package MediaWiki
  24  * @subpackage Parser
  25  */
  26
  27 /**
  28  * Regular expression to match various types of character references in
  29  * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
  30  */
  31 define( 'MW_CHAR_REFS_REGEX',
  32         '/&([A-Za-z0-9]+);
  33          |&\#([0-9]+);
  34          |&\#x([0-9A-Za-z]+);
  35          |&\#X([0-9A-Za-z]+);
  36          |(&)/x' );
  37
  38 /**
  39  * Regular expression to match HTML/XML attribute pairs within a tag.
  40  * Allows some... latitude.
  41  * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
  42  */
   # Building blocks interpolated into the pattern below.
   # $attrib: one attribute-name character (ASCII letters and digits only).
   $attrib = '[A-Za-z0-9]';
   # $space: one whitespace character (tab, LF, CR or space).
   $space = '[\x09\x0a\x0d\x20]';
   # The pattern is a double-quoted PHP string: $space and $attrib are
   # interpolated, and \" \\ \$ are PHP escapes that reach PCRE as " \ $.
   # It is compiled with /x, so the layout whitespace and the "#" comments
   # inside the string are ignored by the regex engine.
   #
   # Capture groups, as read by Sanitizer::getTagAttributeCallback:
   #   1  attribute name
   #   2  the whole "= value" part (not set for a bare attribute)
   #   3  double-quoted value, quotes excluded
   #   4  single-quoted value, quotes excluded
   #   5  unquoted value
   #   6  unquoted #hex colour
   # Each attribute must be preceded by start-of-string or whitespace and
   # followed (lookahead, not consumed) by whitespace or end-of-string.
   # NOTE(review): the character class of alternative 5 already contains
   # "#" and the hex digits, so it appears to win for bare colours and
   # group 6 may never be filled -- confirm before relying on group 6.
   define( 'MW_ATTRIBS_REGEX',
        "/(?:^|$space)($attrib+)
          ($space*=$space*
                (?:
                 # The attribute value: quoted or alone
                  \"([^<\"]*)\"
                 | '([^<']*)'
                 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
                 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                                                        # colors are specified like this.
                                                        # We'll be normalizing it.
                )
           )?(?=$space|\$)/sx" );
  58
  59 /**
  60  * List of all named character entities defined in HTML 4.01
  61  * http://www.w3.org/TR/html4/sgml/entities.html
  62  * @private
  63  */
  64 global $wgHtmlEntities;
  65 $wgHtmlEntities = array(
  66         'Aacute'   => 193,
  67         'aacute'   => 225,
  68         'Acirc'    => 194,
  69         'acirc'    => 226,
  70         'acute'    => 180,
  71         'AElig'    => 198,
  72         'aelig'    => 230,
  73         'Agrave'   => 192,
  74         'agrave'   => 224,
  75         'alefsym'  => 8501,
  76         'Alpha'    => 913,
  77         'alpha'    => 945,
  78         'amp'      => 38,
  79         'and'      => 8743,
  80         'ang'      => 8736,
  81         'Aring'    => 197,
  82         'aring'    => 229,
  83         'asymp'    => 8776,
  84         'Atilde'   => 195,
  85         'atilde'   => 227,
  86         'Auml'     => 196,
  87         'auml'     => 228,
  88         'bdquo'    => 8222,
  89         'Beta'     => 914,
  90         'beta'     => 946,
  91         'brvbar'   => 166,
  92         'bull'     => 8226,
  93         'cap'      => 8745,
  94         'Ccedil'   => 199,
  95         'ccedil'   => 231,
  96         'cedil'    => 184,
  97         'cent'     => 162,
  98         'Chi'      => 935,
  99         'chi'      => 967,
 100         'circ'     => 710,
 101         'clubs'    => 9827,
 102         'cong'     => 8773,
 103         'copy'     => 169,
 104         'crarr'    => 8629,
 105         'cup'      => 8746,
 106         'curren'   => 164,
 107         'dagger'   => 8224,
 108         'Dagger'   => 8225,
 109         'darr'     => 8595,
 110         'dArr'     => 8659,
 111         'deg'      => 176,
 112         'Delta'    => 916,
 113         'delta'    => 948,
 114         'diams'    => 9830,
 115         'divide'   => 247,
 116         'Eacute'   => 201,
 117         'eacute'   => 233,
 118         'Ecirc'    => 202,
 119         'ecirc'    => 234,
 120         'Egrave'   => 200,
 121         'egrave'   => 232,
 122         'empty'    => 8709,
 123         'emsp'     => 8195,
 124         'ensp'     => 8194,
 125         'Epsilon'  => 917,
 126         'epsilon'  => 949,
 127         'equiv'    => 8801,
 128         'Eta'      => 919,
 129         'eta'      => 951,
 130         'ETH'      => 208,
 131         'eth'      => 240,
 132         'Euml'     => 203,
 133         'euml'     => 235,
 134         'euro'     => 8364,
 135         'exist'    => 8707,
 136         'fnof'     => 402,
 137         'forall'   => 8704,
 138         'frac12'   => 189,
 139         'frac14'   => 188,
 140         'frac34'   => 190,
 141         'frasl'    => 8260,
 142         'Gamma'    => 915,
 143         'gamma'    => 947,
 144         'ge'       => 8805,
 145         'gt'       => 62,
 146         'harr'     => 8596,
 147         'hArr'     => 8660,
 148         'hearts'   => 9829,
 149         'hellip'   => 8230,
 150         'Iacute'   => 205,
 151         'iacute'   => 237,
 152         'Icirc'    => 206,
 153         'icirc'    => 238,
 154         'iexcl'    => 161,
 155         'Igrave'   => 204,
 156         'igrave'   => 236,
 157         'image'    => 8465,
 158         'infin'    => 8734,
 159         'int'      => 8747,
 160         'Iota'     => 921,
 161         'iota'     => 953,
 162         'iquest'   => 191,
 163         'isin'     => 8712,
 164         'Iuml'     => 207,
 165         'iuml'     => 239,
 166         'Kappa'    => 922,
 167         'kappa'    => 954,
 168         'Lambda'   => 923,
 169         'lambda'   => 955,
 170         'lang'     => 9001,
 171         'laquo'    => 171,
 172         'larr'     => 8592,
 173         'lArr'     => 8656,
 174         'lceil'    => 8968,
 175         'ldquo'    => 8220,
 176         'le'       => 8804,
 177         'lfloor'   => 8970,
 178         'lowast'   => 8727,
 179         'loz'      => 9674,
 180         'lrm'      => 8206,
 181         'lsaquo'   => 8249,
 182         'lsquo'    => 8216,
 183         'lt'       => 60,
 184         'macr'     => 175,
 185         'mdash'    => 8212,
 186         'micro'    => 181,
 187         'middot'   => 183,
 188         'minus'    => 8722,
 189         'Mu'       => 924,
 190         'mu'       => 956,
 191         'nabla'    => 8711,
 192         'nbsp'     => 160,
 193         'ndash'    => 8211,
 194         'ne'       => 8800,
 195         'ni'       => 8715,
 196         'not'      => 172,
 197         'notin'    => 8713,
 198         'nsub'     => 8836,
 199         'Ntilde'   => 209,
 200         'ntilde'   => 241,
 201         'Nu'       => 925,
 202         'nu'       => 957,
 203         'Oacute'   => 211,
 204         'oacute'   => 243,
 205         'Ocirc'    => 212,
 206         'ocirc'    => 244,
 207         'OElig'    => 338,
 208         'oelig'    => 339,
 209         'Ograve'   => 210,
 210         'ograve'   => 242,
 211         'oline'    => 8254,
 212         'Omega'    => 937,
 213         'omega'    => 969,
 214         'Omicron'  => 927,
 215         'omicron'  => 959,
 216         'oplus'    => 8853,
 217         'or'       => 8744,
 218         'ordf'     => 170,
 219         'ordm'     => 186,
 220         'Oslash'   => 216,
 221         'oslash'   => 248,
 222         'Otilde'   => 213,
 223         'otilde'   => 245,
 224         'otimes'   => 8855,
 225         'Ouml'     => 214,
 226         'ouml'     => 246,
 227         'para'     => 182,
 228         'part'     => 8706,
 229         'permil'   => 8240,
 230         'perp'     => 8869,
 231         'Phi'      => 934,
 232         'phi'      => 966,
 233         'Pi'       => 928,
 234         'pi'       => 960,
 235         'piv'      => 982,
 236         'plusmn'   => 177,
 237         'pound'    => 163,
 238         'prime'    => 8242,
 239         'Prime'    => 8243,
 240         'prod'     => 8719,
 241         'prop'     => 8733,
 242         'Psi'      => 936,
 243         'psi'      => 968,
 244         'quot'     => 34,
 245         'radic'    => 8730,
 246         'rang'     => 9002,
 247         'raquo'    => 187,
 248         'rarr'     => 8594,
 249         'rArr'     => 8658,
 250         'rceil'    => 8969,
 251         'rdquo'    => 8221,
 252         'real'     => 8476,
 253         'reg'      => 174,
 254         'rfloor'   => 8971,
 255         'Rho'      => 929,
 256         'rho'      => 961,
 257         'rlm'      => 8207,
 258         'rsaquo'   => 8250,
 259         'rsquo'    => 8217,
 260         'sbquo'    => 8218,
 261         'Scaron'   => 352,
 262         'scaron'   => 353,
 263         'sdot'     => 8901,
 264         'sect'     => 167,
 265         'shy'      => 173,
 266         'Sigma'    => 931,
 267         'sigma'    => 963,
 268         'sigmaf'   => 962,
 269         'sim'      => 8764,
 270         'spades'   => 9824,
 271         'sub'      => 8834,
 272         'sube'     => 8838,
 273         'sum'      => 8721,
 274         'sup'      => 8835,
 275         'sup1'     => 185,
 276         'sup2'     => 178,
 277         'sup3'     => 179,
 278         'supe'     => 8839,
 279         'szlig'    => 223,
 280         'Tau'      => 932,
 281         'tau'      => 964,
 282         'there4'   => 8756,
 283         'Theta'    => 920,
 284         'theta'    => 952,
 285         'thetasym' => 977,
 286         'thinsp'   => 8201,
 287         'THORN'    => 222,
 288         'thorn'    => 254,
 289         'tilde'    => 732,
 290         'times'    => 215,
 291         'trade'    => 8482,
 292         'Uacute'   => 218,
 293         'uacute'   => 250,
 294         'uarr'     => 8593,
 295         'uArr'     => 8657,
 296         'Ucirc'    => 219,
 297         'ucirc'    => 251,
 298         'Ugrave'   => 217,
 299         'ugrave'   => 249,
 300         'uml'      => 168,
 301         'upsih'    => 978,
 302         'Upsilon'  => 933,
 303         'upsilon'  => 965,
 304         'Uuml'     => 220,
 305         'uuml'     => 252,
 306         'weierp'   => 8472,
 307         'Xi'       => 926,
 308         'xi'       => 958,
 309         'Yacute'   => 221,
 310         'yacute'   => 253,
 311         'yen'      => 165,
 312         'Yuml'     => 376,
 313         'yuml'     => 255,
 314         'Zeta'     => 918,
 315         'zeta'     => 950,
 316         'zwj'      => 8205,
 317         'zwnj'     => 8204 );
 318
 319 /** @package MediaWiki */
 320 class Sanitizer {
 321         /**
 322          * Cleans up HTML, removes dangerous tags and attributes, and
 323          * removes HTML comments
 324          * @private
 325          * @param string $text
 326          * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
 327          * @param array $args for the processing callback
 328          * @return string
 329          */
 330         function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
 331                 global $wgUseTidy, $wgUserHtml;
 332                 $fname = 'Parser::removeHTMLtags';
 333                 wfProfileIn( $fname );
 334
 335                 if( $wgUserHtml ) {
 336                         $htmlpairs = array( # Tags that must be closed
 337                                 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 338                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 339                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
 340                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 341                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
 342                         );
 343                         $htmlsingle = array(
 344                                 'br', 'hr', 'li', 'dt', 'dd'
 345                         );
 346                         $htmlsingleonly = array( # Elements that cannot have close tags
 347                                 'br', 'hr'
 348                         );
 349                         $htmlnest = array( # Tags that can be nested--??
 350                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 351                                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
 352                         );
 353                         $tabletags = array( # Can only appear inside table
 354                                 'td', 'th', 'tr'
 355                         );
 356                 } else {
 357                         $htmlpairs = array();
 358                         $htmlsingle = array();
 359                         $htmlnest = array();
 360                         $tabletags = array();
 361                 }
 362
 363                 $htmlsingle = array_merge( $tabletags, $htmlsingle );
 364                 $htmlelements = array_merge( $htmlsingle, $htmlpairs );
 365
 366                 # Remove HTML comments
 367                 $text = Sanitizer::removeHTMLcomments( $text );
 368
 369                 $bits = explode( '<', $text );
 370                 $text = array_shift( $bits );
 371                 if(!$wgUseTidy) {
 372                         $tagstack = array(); $tablestack = array();
 373                         foreach ( $bits as $x ) {
 374                                 $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
 375                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 376                                 $x, $regs );
 377                                 list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 378                                 error_reporting( $prev );
 379
 380                                 $badtag = 0 ;
 381                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 382                                         # Check our stack
 383                                         if ( $slash ) {
 384                                                 # Closing a tag...
 385                                                 if( in_array( $t, $htmlsingleonly ) ) {
 386                                                         $badtag = 1;
 387                                                 } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
 388                                                         @array_push( $tagstack, $ot );
 389                                                         $badtag = 1;
 390                                                 } else {
 391                                                         if ( $t == 'table' ) {
 392                                                                 $tagstack = array_pop( $tablestack );
 393                                                         }
 394                                                         $newparams = '';
 395                                                 }
 396                                         } else {
 397                                                 # Keep track for later
 398                                                 if ( in_array( $t, $tabletags ) &&
 399                                                 ! in_array( 'table', $tagstack ) ) {
 400                                                         $badtag = 1;
 401                                                 } else if ( in_array( $t, $tagstack ) &&
 402                                                 ! in_array ( $t , $htmlnest ) ) {
 403                                                         $badtag = 1 ;
 404                                                 } elseif( in_array( $t, $htmlsingleonly ) ) {
 405                                                         # Hack to force empty tag for uncloseable elements
 406                                                         $brace = '/>';
 407                                                 } else if( in_array( $t, $htmlsingle ) ) {
 408                                                         # Hack to not close $htmlsingle tags
 409                                                         $brace = NULL;
 410                                                 } else {
 411                                                         if ( $t == 'table' ) {
 412                                                                 array_push( $tablestack, $tagstack );
 413                                                                 $tagstack = array();
 414                                                         }
 415                                                         array_push( $tagstack, $t );
 416                                                 }
 417
 418                                                 # Replace any variables or template parameters with
 419                                                 # plaintext results.
 420                                                 if( is_callable( $processCallback ) ) {
 421                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
 422                                                 }
 423
 424                                                 # Strip non-approved attributes from the tag
 425                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 426                                         }
 427                                         if ( ! $badtag ) {
 428                                                 $rest = str_replace( '>', '&gt;', $rest );
 429                                                 $close = ( $brace == '/>' ) ? ' /' : '';
 430                                                 $text .= "<$slash$t$newparams$close>$rest";
 431                                                 continue;
 432                                         }
 433                                 }
 434                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 435                         }
 436                         # Close off any remaining tags
 437                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 438                                 $text .= "</$t>\n";
 439                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 440                         }
 441                 } else {
 442                         # this might be possible using tidy itself
 443                         foreach ( $bits as $x ) {
 444                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 445                                 $x, $regs );
 446                                 @list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 447                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 448                                         if( is_callable( $processCallback ) ) {
 449                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
 450                                         }
 451                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 452                                         $rest = str_replace( '>', '&gt;', $rest );
 453                                         $text .= "<$slash$t$newparams$brace$rest";
 454                                 } else {
 455                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 456                                 }
 457                         }
 458                 }
 459                 wfProfileOut( $fname );
 460                 return $text;
 461         }
 462
 463         /**
 464          * Remove '<!--', '-->', and everything between.
 465          * To avoid leaving blank lines, when a comment is both preceded
 466          * and followed by a newline (ignoring spaces), trim leading and
 467          * trailing spaces and one of the newlines.
 468          *
 469          * @private
 470          * @param string $text
 471          * @return string
 472          */
 473         function removeHTMLcomments( $text ) {
 474                 $fname='Parser::removeHTMLcomments';
 475                 wfProfileIn( $fname );
 476                 while (($start = strpos($text, '<!--')) !== false) {
 477                         $end = strpos($text, '-->', $start + 4);
 478                         if ($end === false) {
 479                                 # Unterminated comment; bail out
 480                                 break;
 481                         }
 482
 483                         $end += 3;
 484
 485                         # Trim space and newline if the comment is both
 486                         # preceded and followed by a newline
 487                         $spaceStart = max($start - 1, 0);
 488                         $spaceLen = $end - $spaceStart;
 489                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 490                                 $spaceStart--;
 491                                 $spaceLen++;
 492                         }
 493                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 494                                 $spaceLen++;
 495                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 496                                 # Remove the comment, leading and trailing
 497                                 # spaces, and leave only one newline.
 498                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 499                         }
 500                         else {
 501                                 # Remove just the comment.
 502                                 $text = substr_replace($text, '', $start, $end - $start);
 503                         }
 504                 }
 505                 wfProfileOut( $fname );
 506                 return $text;
 507         }
 508
 509         /**
 510          * Take a tag soup fragment listing an HTML element's attributes
 511          * and normalize it to well-formed XML, discarding unwanted attributes.
 512          *
 513          * - Normalizes attribute names to lowercase
 514          * - Discards attributes not on a whitelist for the given element
 515          * - Turns broken or invalid entities into plaintext
 516          * - Double-quotes all attribute values
 517          * - Attributes without values are given the name as attribute
 518          * - Double attributes are discarded
 519          * - Unsafe style attributes are discarded
 520          * - Prepends space if there are attributes.
 521          *
 522          * @param string $text
 523          * @param string $element
 524          * @return string
 525          *
 526          * @todo Check for legal values where the DTD limits things.
 527          * @todo Check for unique id attribute :P
 528          */
 529         function fixTagAttributes( $text, $element ) {
 530                 if( trim( $text ) == '' ) {
 531                         return '';
 532                 }
 533
 534                 # Unquoted attribute
 535                 # Since we quote this later, this can be anything distinguishable
 536                 # from the end of the attribute
 537                 $pairs = array();
 538                 if( !preg_match_all(
 539                         MW_ATTRIBS_REGEX,
 540                         $text,
 541                         $pairs,
 542                         PREG_SET_ORDER ) ) {
 543                         return '';
 544                 }
 545
 546                 $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
 547                 $attribs = array();
 548                 foreach( $pairs as $set ) {
 549                         $attribute = strtolower( $set[1] );
 550                         if( !isset( $whitelist[$attribute] ) ) {
 551                                 continue;
 552                         }
 553
 554                         $raw   = Sanitizer::getTagAttributeCallback( $set );
 555                         $value = Sanitizer::normalizeAttributeValue( $raw );
 556
 557                         # Strip javascript "expression" from stylesheets.
 558                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 559                         if( $attribute == 'style' ) {
 560                                 $stripped = Sanitizer::decodeCharReferences( $value );
 561
 562                                 // Remove any comments; IE gets token splitting wrong
 563                                 $stripped = preg_replace( '!/\\*.*?\\*/!S', ' ', $stripped );
 564                                 $value = htmlspecialchars( $stripped );
 565
 566                                 // ... and continue checks
 567                                 $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
 568                                         'codepointToUtf8(hexdec("$1"))', $stripped );
 569                                 $stripped = str_replace( '\\', '', $stripped );
 570                                 if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
 571                                                 $stripped ) ) {
 572                                         # haxx0r
 573                                         continue;
 574                                 }
 575                         }
 576
 577                         if ( $attribute === 'id' )
 578                                 $value = Sanitizer::escapeId( $value );
 579
 580                         # Templates and links may be expanded in later parsing,
 581                         # creating invalid or dangerous output. Suppress this.
 582                         $value = strtr( $value, array(
 583                                 '<'    => '&lt;',   // This should never happen,
 584                                 '>'    => '&gt;',   // we've received invalid input
 585                                 '"'    => '&quot;', // which should have been escaped.
 586                                 '{'    => '&#123;',
 587                                 '['    => '&#91;',
 588                                 "''"   => '&#39;&#39;',
 589                                 'ISBN' => '&#73;SBN',
 590                                 'RFC'  => '&#82;FC',
 591                                 'PMID' => '&#80;MID',
 592                         ) );
 593
 594                         # Stupid hack
 595                         $value = preg_replace_callback(
 596                                 '/(' . wfUrlProtocols() . ')/',
 597                                 array( 'Sanitizer', 'armorLinksCallback' ),
 598                                 $value );
 599
 600                         // If this attribute was previously set, override it.
 601                         // Output should only have one attribute of each name.
 602                         $attribs[$attribute] = "$attribute=\"$value\"";
 603                 }
 604
 605                 return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
 606         }
 607
 608         /**
 609          * Given a value escape it so that it can be used in an id attribute and
 610          * return it, this does not validate the value however (see first link)
 611          *
 612          * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
 613          *                                                          in the id and
 614          *                                                          name attributes
 615          * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 616          *
 617          * @bug 4461
 618          *
 619          * @static
 620          *
 621          * @param string $id
 622          * @return string
 623          */
 624         function escapeId( $id ) {
 625                 static $replace = array(
 626                         '%3A' => ':',
 627                         '%' => '.'
 628                 );
 629
 630                 $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
 631
 632                 return str_replace( array_keys( $replace ), array_values( $replace ), $id );
 633         }
 634
 635         /**
 636          * Regex replace callback for armoring links against further processing.
 637          * @param array $matches
 638          * @return string
 639          * @private
 640          */
 641         function armorLinksCallback( $matches ) {
 642                 return str_replace( ':', '&#58;', $matches[1] );
 643         }
 644
 645         /**
 646          * Return an associative array of attribute names and values from
  647          * a partial tag string. Attribute names are forced to lowercase,
 648          * character references are decoded to UTF-8 text.
 649          *
 650          * @param string
 651          * @return array
 652          */
 653         function decodeTagAttributes( $text ) {
 654                 $attribs = array();
 655
 656                 if( trim( $text ) == '' ) {
 657                         return $attribs;
 658                 }
 659
 660                 $pairs = array();
 661                 if( !preg_match_all(
 662                         MW_ATTRIBS_REGEX,
 663                         $text,
 664                         $pairs,
 665                         PREG_SET_ORDER ) ) {
 666                         return $attribs;
 667                 }
 668
 669                 foreach( $pairs as $set ) {
 670                         $attribute = strtolower( $set[1] );
 671                         $value = Sanitizer::getTagAttributeCallback( $set );
 672                         $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
 673                 }
 674                 return $attribs;
 675         }
 676
 677         /**
 678          * Pick the appropriate attribute value from a match set from the
 679          * MW_ATTRIBS_REGEX matches.
 680          *
 681          * @param array $set
 682          * @return string
 683          * @private
 684          */
 685         function getTagAttributeCallback( $set ) {
 686                 if( isset( $set[6] ) ) {
 687                         # Illegal #XXXXXX color with no quotes.
 688                         return $set[6];
 689                 } elseif( isset( $set[5] ) ) {
 690                         # No quotes.
 691                         return $set[5];
 692                 } elseif( isset( $set[4] ) ) {
 693                         # Single-quoted
 694                         return $set[4];
 695                 } elseif( isset( $set[3] ) ) {
 696                         # Double-quoted
 697                         return $set[3];
 698                 } elseif( !isset( $set[2] ) ) {
 699                         # In XHTML, attributes must have a value.
 700                         # For 'reduced' form, return explicitly the attribute name here.
 701                         return $set[1];
 702                 } else {
 703                         wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
 704                 }
 705         }
 706
 707         /**
 708          * Normalize whitespace and character references in an XML source-
 709          * encoded text for an attribute value.
 710          *
 711          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 712          * but note that we're not returning the value, but are returning
 713          * XML source fragments that will be slapped into output.
 714          *
 715          * @param string $text
 716          * @return string
 717          * @private
 718          */
 719         function normalizeAttributeValue( $text ) {
 720                 return str_replace( '"', '&quot;',
 721                         preg_replace(
 722                                 '/\r\n|[\x20\x0d\x0a\x09]/',
 723                                 ' ',
 724                                 Sanitizer::normalizeCharReferences( $text ) ) );
 725         }
 726
 727         /**
 728          * Ensure that any entities and character references are legal
 729          * for XML and XHTML specifically. Any stray bits will be
 730          * &amp;-escaped to result in a valid text fragment.
 731          *
 732          * a. any named char refs must be known in XHTML
 733          * b. any numeric char refs must be legal chars, not invalid or forbidden
 734          * c. use &#x, not &#X
 735          * d. fix or reject non-valid attributes
 736          *
 737          * @param string $text
 738          * @return string
 739          * @private
 740          */
 741         function normalizeCharReferences( $text ) {
 742                 return preg_replace_callback(
 743                         MW_CHAR_REFS_REGEX,
 744                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
 745                         $text );
 746         }
 747         /**
 748          * @param array $matches
 749          * @return string
 750          */
 751         function normalizeCharReferencesCallback( $matches ) {
 752                 $ret = null;
 753                 if( $matches[1] != '' ) {
 754                         $ret = Sanitizer::normalizeEntity( $matches[1] );
 755                 } elseif( $matches[2] != '' ) {
 756                         $ret = Sanitizer::decCharReference( $matches[2] );
 757                 } elseif( $matches[3] != ''  ) {
 758                         $ret = Sanitizer::hexCharReference( $matches[3] );
 759                 } elseif( $matches[4] != '' ) {
 760                         $ret = Sanitizer::hexCharReference( $matches[4] );
 761                 }
 762                 if( is_null( $ret ) ) {
 763                         return htmlspecialchars( $matches[0] );
 764                 } else {
 765                         return $ret;
 766                 }
 767         }
 768
 769         /**
 770          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 771          * return the named entity reference as is. Otherwise, returns
 772          * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
 773          *
 774          * @param string $name
 775          * @return string
 776          */
 777         function normalizeEntity( $name ) {
 778                 global $wgHtmlEntities;
 779                 if( isset( $wgHtmlEntities[$name] ) ) {
 780                         return "&$name;";
 781                 } else {
 782                         return "&amp;$name;";
 783                 }
 784         }
 785
 786         function decCharReference( $codepoint ) {
 787                 $point = intval( $codepoint );
 788                 if( Sanitizer::validateCodepoint( $point ) ) {
 789                         return sprintf( '&#%d;', $point );
 790                 } else {
 791                         return null;
 792                 }
 793         }
 794
 795         function hexCharReference( $codepoint ) {
 796                 $point = hexdec( $codepoint );
 797                 if( Sanitizer::validateCodepoint( $point ) ) {
 798                         return sprintf( '&#x%x;', $point );
 799                 } else {
 800                         return null;
 801                 }
 802         }
 803
 804         /**
 805          * Returns true if a given Unicode codepoint is a valid character in XML.
 806          * @param int $codepoint
 807          * @return bool
 808          */
 809         function validateCodepoint( $codepoint ) {
 810                 return ($codepoint ==    0x09)
 811                         || ($codepoint ==    0x0a)
 812                         || ($codepoint ==    0x0d)
 813                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
 814                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
 815                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
 816         }
 817
 818         /**
 819          * Decode any character references, numeric or named entities,
 820          * in the text and return a UTF-8 string.
 821          *
 822          * @param string $text
 823          * @return string
 824          * @public
 825          */
 826         function decodeCharReferences( $text ) {
 827                 return preg_replace_callback(
 828                         MW_CHAR_REFS_REGEX,
 829                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
 830                         $text );
 831         }
 832
 833         /**
 834          * @param array $matches
 835          * @return string
 836          */
 837         function decodeCharReferencesCallback( $matches ) {
 838                 if( $matches[1] != '' ) {
 839                         return Sanitizer::decodeEntity( $matches[1] );
 840                 } elseif( $matches[2] != '' ) {
 841                         return  Sanitizer::decodeChar( intval( $matches[2] ) );
 842                 } elseif( $matches[3] != ''  ) {
 843                         return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
 844                 } elseif( $matches[4] != '' ) {
 845                         return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
 846                 }
 847                 # Last case should be an ampersand by itself
 848                 return $matches[0];
 849         }
 850
 851         /**
 852          * Return UTF-8 string for a codepoint if that is a valid
 853          * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 854          * @param int $codepoint
 855          * @return string
 856          * @private
 857          */
 858         function decodeChar( $codepoint ) {
 859                 if( Sanitizer::validateCodepoint( $codepoint ) ) {
 860                         return codepointToUtf8( $codepoint );
 861                 } else {
 862                         return UTF8_REPLACEMENT;
 863                 }
 864         }
 865
 866         /**
 867          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 868          * return the UTF-8 encoding of that character. Otherwise, returns
 869          * pseudo-entity source (eg &foo;)
 870          *
 871          * @param string $name
 872          * @return string
 873          */
 874         function decodeEntity( $name ) {
 875                 global $wgHtmlEntities;
 876                 if( isset( $wgHtmlEntities[$name] ) ) {
 877                         return codepointToUtf8( $wgHtmlEntities[$name] );
 878                 } else {
 879                         return "&$name;";
 880                 }
 881         }
 882
 883         /**
 884          * Fetch the whitelist of acceptable attributes for a given
 885          * element name.
 886          *
 887          * @param string $element
 888          * @return array
 889          */
 890         function attributeWhitelist( $element ) {
 891                 static $list;
 892                 if( !isset( $list ) ) {
 893                         $list = Sanitizer::setupAttributeWhitelist();
 894                 }
 895                 return isset( $list[$element] )
 896                         ? $list[$element]
 897                         : array();
 898         }
 899
 900         /**
 901          * @return array
 902          */
 903         function setupAttributeWhitelist() {
 904                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
 905                 $block = array_merge( $common, array( 'align' ) );
 906                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
 907                 $tablecell = array( 'abbr',
 908                                     'axis',
 909                                     'headers',
 910                                     'scope',
 911                                     'rowspan',
 912                                     'colspan',
 913                                     'nowrap', # deprecated
 914                                     'width',  # deprecated
 915                                     'height', # deprecated
 916                                     'bgcolor' # deprecated
 917                                     );
 918
 919                 # Numbers refer to sections in HTML 4.01 standard describing the element.
 920                 # See: http://www.w3.org/TR/html4/
 921                 $whitelist = array (
 922                         # 7.5.4
 923                         'div'        => $block,
 924                         'center'     => $common, # deprecated
 925                         'span'       => $block, # ??
 926
 927                         # 7.5.5
 928                         'h1'         => $block,
 929                         'h2'         => $block,
 930                         'h3'         => $block,
 931                         'h4'         => $block,
 932                         'h5'         => $block,
 933                         'h6'         => $block,
 934
 935                         # 7.5.6
 936                         # address
 937
 938                         # 8.2.4
 939                         # bdo
 940
 941                         # 9.2.1
 942                         'em'         => $common,
 943                         'strong'     => $common,
 944                         'cite'       => $common,
 945                         # dfn
 946                         'code'       => $common,
 947                         # samp
 948                         # kbd
 949                         'var'        => $common,
 950                         # abbr
 951                         # acronym
 952
 953                         # 9.2.2
 954                         'blockquote' => array_merge( $common, array( 'cite' ) ),
 955                         # q
 956
 957                         # 9.2.3
 958                         'sub'        => $common,
 959                         'sup'        => $common,
 960
 961                         # 9.3.1
 962                         'p'          => $block,
 963
 964                         # 9.3.2
 965                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
 966
 967                         # 9.3.4
 968                         'pre'        => array_merge( $common, array( 'width' ) ),
 969
 970                         # 9.4
 971                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 972                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 973
 974                         # 10.2
 975                         'ul'         => array_merge( $common, array( 'type' ) ),
 976                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
 977                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
 978
 979                         # 10.3
 980                         'dl'         => $common,
 981                         'dd'         => $common,
 982                         'dt'         => $common,
 983
 984                         # 11.2.1
 985                         'table'      => array_merge( $common,
 986                                                                 array( 'summary', 'width', 'border', 'frame',
 987                                                                                          'rules', 'cellspacing', 'cellpadding',
 988                                                                                          'align', 'bgcolor', 'frame', 'rules',
 989                                                                                          'border' ) ),
 990
 991                         # 11.2.2
 992                         'caption'    => array_merge( $common, array( 'align' ) ),
 993
 994                         # 11.2.3
 995                         'thead'      => array_merge( $common, $tablealign ),
 996                         'tfoot'      => array_merge( $common, $tablealign ),
 997                         'tbody'      => array_merge( $common, $tablealign ),
 998
 999                         # 11.2.4
1000                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1001                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1002
1003                         # 11.2.5
1004                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
1005
1006                         # 11.2.6
1007                         'td'         => array_merge( $common, $tablecell, $tablealign ),
1008                         'th'         => array_merge( $common, $tablecell, $tablealign ),
1009
1010                         # 15.2.1
1011                         'tt'         => $common,
1012                         'b'          => $common,
1013                         'i'          => $common,
1014                         'big'        => $common,
1015                         'small'      => $common,
1016                         'strike'     => $common,
1017                         's'          => $common,
1018                         'u'          => $common,
1019
1020                         # 15.2.2
1021                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
1022                         # basefont
1023
1024                         # 15.3
1025                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
1026
1027                         # XHTML Ruby annotation text module, simple ruby only.
1028                         # http://www.w3c.org/TR/ruby/
1029                         'ruby'       => $common,
1030                         # rbc
1031                         # rtc
1032                         'rb'         => $common,
1033                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
1034                         'rp'         => $common,
1035                         );
1036                 return $whitelist;
1037         }
1038
1039         /**
1040          * Take a fragment of (potentially invalid) HTML and return
1041          * a version with any tags removed, encoded suitably for literal
1042          * inclusion in an attribute value.
1043          *
1044          * @param string $text HTML fragment
1045          * @return string
1046          */
1047         function stripAllTags( $text ) {
1048                 # Actual <tags>
1049                 $text = preg_replace( '/ < .*? > /x', '', $text );
1050
1051                 # Normalize &entities and whitespace
1052                 $text = Sanitizer::normalizeAttributeValue( $text );
1053
1054                 # Will be placed into "double-quoted" attributes,
1055                 # make sure remaining bits are safe.
1056                 $text = str_replace(
1057                         array('<', '>', '"'),
1058                         array('&lt;', '&gt;', '&quot;'),
1059                         $text );
1060
1061                 return $text;
1062         }
1063
1064         /**
1065          * Hack up a private DOCTYPE with HTML's standard entity declarations.
1066          * PHP 4 seemed to know these if you gave it an HTML doctype, but
1067          * PHP 5.1 doesn't.
1068          *
1069          * Use for passing XHTML fragments to PHP's XML parsing functions
1070          *
1071          * @return string
1072          * @static
1073          */
1074         function hackDocType() {
1075                 global $wgHtmlEntities;
1076                 $out = "<!DOCTYPE html [\n";
1077                 foreach( $wgHtmlEntities as $entity => $codepoint ) {
1078                         $out .= "<!ENTITY $entity \"&#$codepoint;\">";
1079                 }
1080                 $out .= "]>\n";
1081                 return $out;
1082         }
1083
1084 }
1085
1086 ?>