includes/Sanitizer.php

   1 <?php
   2 /**
   3  * (X)HTML sanitizer for MediaWiki
   4  *
   5  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * http://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @package MediaWiki
  24  * @subpackage Parser
  25  */
  26
  27 /**
  28  * Regular expression to match various types of character references in
  29  * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
  30  */
  31 define( 'MW_CHAR_REFS_REGEX',
  32         '/&([A-Za-z0-9]+);
  33          |&\#([0-9]+);
  34          |&\#x([0-9A-Za-z]+);
  35          |&\#X([0-9A-Za-z]+);
  36          |(&)/x' );
  37
  38 /**
  39  * Regular expression to match HTML/XML attribute pairs within a tag.
  40  * Allows some... latitude.
  41  * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
  42  */
# Character class for one attribute-name character: ASCII letters and digits.
$attrib = '[A-Za-z0-9]';
# Character class for one whitespace character: tab, LF, CR or space.
$space = '[\x09\x0a\x0d\x20]';
# Both fragments are interpolated into the pattern below. The pattern runs in
# extended (x) mode, so the layout whitespace and '#' remarks inside the
# string are ignored by the regex engine.
#
# Capture groups:
#   1 = attribute name
#   2 = the whole optional "= value" part
#   3 = double-quoted value, 4 = single-quoted value,
#   5 = unquoted value,      6 = unquoted '#hex' colour
# An attribute must be preceded by start-of-string or whitespace and followed
# by whitespace or end-of-string.
# NOTE(review): the character class of alternative 5 already contains '#',
# digits and letters, so alternative 6 looks unreachable - confirm before
# relying on group 6.
define( 'MW_ATTRIBS_REGEX',
        "/(?:^|$space)($attrib+)
          ($space*=$space*
                (?:
                 # The attribute value: quoted or alone
                  \"([^<\"]*)\"
                 | '([^<']*)'
                 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
                 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                                                         # colors are specified like this.
                                                         # We'll be normalizing it.
                )
           )?(?=$space|\$)/sx" );
  58
  59 /**
  60  * List of all named character entities defined in HTML 4.01
  61  * http://www.w3.org/TR/html4/sgml/entities.html
  62  * @access private
  63  */
# Lookup table: entity name (case-sensitive, written without the leading '&'
# and trailing ';') => decimal Unicode code point.
# Entries are sorted by name, ignoring case.
# NOTE(review): the explicit 'global' presumably keeps the table in global
# scope even when this file is included from inside a function - confirm.
global $wgHtmlEntities;
$wgHtmlEntities = array(
        'Aacute'   => 193,
        'aacute'   => 225,
        'Acirc'    => 194,
        'acirc'    => 226,
        'acute'    => 180,
        'AElig'    => 198,
        'aelig'    => 230,
        'Agrave'   => 192,
        'agrave'   => 224,
        'alefsym'  => 8501,
        'Alpha'    => 913,
        'alpha'    => 945,
        'amp'      => 38,
        'and'      => 8743,
        'ang'      => 8736,
        'Aring'    => 197,
        'aring'    => 229,
        'asymp'    => 8776,
        'Atilde'   => 195,
        'atilde'   => 227,
        'Auml'     => 196,
        'auml'     => 228,
        'bdquo'    => 8222,
        'Beta'     => 914,
        'beta'     => 946,
        'brvbar'   => 166,
        'bull'     => 8226,
        'cap'      => 8745,
        'Ccedil'   => 199,
        'ccedil'   => 231,
        'cedil'    => 184,
        'cent'     => 162,
        'Chi'      => 935,
        'chi'      => 967,
        'circ'     => 710,
        'clubs'    => 9827,
        'cong'     => 8773,
        'copy'     => 169,
        'crarr'    => 8629,
        'cup'      => 8746,
        'curren'   => 164,
        'dagger'   => 8224,
        'Dagger'   => 8225,
        'darr'     => 8595,
        'dArr'     => 8659,
        'deg'      => 176,
        'Delta'    => 916,
        'delta'    => 948,
        'diams'    => 9830,
        'divide'   => 247,
        'Eacute'   => 201,
        'eacute'   => 233,
        'Ecirc'    => 202,
        'ecirc'    => 234,
        'Egrave'   => 200,
        'egrave'   => 232,
        'empty'    => 8709,
        'emsp'     => 8195,
        'ensp'     => 8194,
        'Epsilon'  => 917,
        'epsilon'  => 949,
        'equiv'    => 8801,
        'Eta'      => 919,
        'eta'      => 951,
        'ETH'      => 208,
        'eth'      => 240,
        'Euml'     => 203,
        'euml'     => 235,
        'euro'     => 8364,
        'exist'    => 8707,
        'fnof'     => 402,
        'forall'   => 8704,
        'frac12'   => 189,
        'frac14'   => 188,
        'frac34'   => 190,
        'frasl'    => 8260,
        'Gamma'    => 915,
        'gamma'    => 947,
        'ge'       => 8805,
        'gt'       => 62,
        'harr'     => 8596,
        'hArr'     => 8660,
        'hearts'   => 9829,
        'hellip'   => 8230,
        'Iacute'   => 205,
        'iacute'   => 237,
        'Icirc'    => 206,
        'icirc'    => 238,
        'iexcl'    => 161,
        'Igrave'   => 204,
        'igrave'   => 236,
        'image'    => 8465,
        'infin'    => 8734,
        'int'      => 8747,
        'Iota'     => 921,
        'iota'     => 953,
        'iquest'   => 191,
        'isin'     => 8712,
        'Iuml'     => 207,
        'iuml'     => 239,
        'Kappa'    => 922,
        'kappa'    => 954,
        'Lambda'   => 923,
        'lambda'   => 955,
        'lang'     => 9001,
        'laquo'    => 171,
        'larr'     => 8592,
        'lArr'     => 8656,
        'lceil'    => 8968,
        'ldquo'    => 8220,
        'le'       => 8804,
        'lfloor'   => 8970,
        'lowast'   => 8727,
        'loz'      => 9674,
        'lrm'      => 8206,
        'lsaquo'   => 8249,
        'lsquo'    => 8216,
        'lt'       => 60,
        'macr'     => 175,
        'mdash'    => 8212,
        'micro'    => 181,
        'middot'   => 183,
        'minus'    => 8722,
        'Mu'       => 924,
        'mu'       => 956,
        'nabla'    => 8711,
        'nbsp'     => 160,
        'ndash'    => 8211,
        'ne'       => 8800,
        'ni'       => 8715,
        'not'      => 172,
        'notin'    => 8713,
        'nsub'     => 8836,
        'Ntilde'   => 209,
        'ntilde'   => 241,
        'Nu'       => 925,
        'nu'       => 957,
        'Oacute'   => 211,
        'oacute'   => 243,
        'Ocirc'    => 212,
        'ocirc'    => 244,
        'OElig'    => 338,
        'oelig'    => 339,
        'Ograve'   => 210,
        'ograve'   => 242,
        'oline'    => 8254,
        'Omega'    => 937,
        'omega'    => 969,
        'Omicron'  => 927,
        'omicron'  => 959,
        'oplus'    => 8853,
        'or'       => 8744,
        'ordf'     => 170,
        'ordm'     => 186,
        'Oslash'   => 216,
        'oslash'   => 248,
        'Otilde'   => 213,
        'otilde'   => 245,
        'otimes'   => 8855,
        'Ouml'     => 214,
        'ouml'     => 246,
        'para'     => 182,
        'part'     => 8706,
        'permil'   => 8240,
        'perp'     => 8869,
        'Phi'      => 934,
        'phi'      => 966,
        'Pi'       => 928,
        'pi'       => 960,
        'piv'      => 982,
        'plusmn'   => 177,
        'pound'    => 163,
        'prime'    => 8242,
        'Prime'    => 8243,
        'prod'     => 8719,
        'prop'     => 8733,
        'Psi'      => 936,
        'psi'      => 968,
        'quot'     => 34,
        'radic'    => 8730,
        'rang'     => 9002,
        'raquo'    => 187,
        'rarr'     => 8594,
        'rArr'     => 8658,
        'rceil'    => 8969,
        'rdquo'    => 8221,
        'real'     => 8476,
        'reg'      => 174,
        'rfloor'   => 8971,
        'Rho'      => 929,
        'rho'      => 961,
        'rlm'      => 8207,
        'rsaquo'   => 8250,
        'rsquo'    => 8217,
        'sbquo'    => 8218,
        'Scaron'   => 352,
        'scaron'   => 353,
        'sdot'     => 8901,
        'sect'     => 167,
        'shy'      => 173,
        'Sigma'    => 931,
        'sigma'    => 963,
        'sigmaf'   => 962,
        'sim'      => 8764,
        'spades'   => 9824,
        'sub'      => 8834,
        'sube'     => 8838,
        'sum'      => 8721,
        'sup'      => 8835,
        'sup1'     => 185,
        'sup2'     => 178,
        'sup3'     => 179,
        'supe'     => 8839,
        'szlig'    => 223,
        'Tau'      => 932,
        'tau'      => 964,
        'there4'   => 8756,
        'Theta'    => 920,
        'theta'    => 952,
        'thetasym' => 977,
        'thinsp'   => 8201,
        'THORN'    => 222,
        'thorn'    => 254,
        'tilde'    => 732,
        'times'    => 215,
        'trade'    => 8482,
        'Uacute'   => 218,
        'uacute'   => 250,
        'uarr'     => 8593,
        'uArr'     => 8657,
        'Ucirc'    => 219,
        'ucirc'    => 251,
        'Ugrave'   => 217,
        'ugrave'   => 249,
        'uml'      => 168,
        'upsih'    => 978,
        'Upsilon'  => 933,
        'upsilon'  => 965,
        'Uuml'     => 220,
        'uuml'     => 252,
        'weierp'   => 8472,
        'Xi'       => 926,
        'xi'       => 958,
        'Yacute'   => 221,
        'yacute'   => 253,
        'yen'      => 165,
        'Yuml'     => 376,
        'yuml'     => 255,
        'Zeta'     => 918,
        'zeta'     => 950,
        'zwj'      => 8205,
        'zwnj'     => 8204 );
 318
 319 /** @package MediaWiki */
 320 class Sanitizer {
 321         /**
 322          * Cleans up HTML, removes dangerous tags and attributes, and
 323          * removes HTML comments
 324          * @access private
 325          * @param string $text
 326          * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
 327          * @param array $args for the processing callback
 328          * @return string
 329          */
 330         function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
 331                 global $wgUseTidy, $wgUserHtml;
 332                 $fname = 'Parser::removeHTMLtags';
 333                 wfProfileIn( $fname );
 334
 335                 if( $wgUserHtml ) {
 336                         $htmlpairs = array( # Tags that must be closed
 337                                 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
 338                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
 339                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
 340                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
 341                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
 342                         );
 343                         $htmlsingle = array(
 344                                 'br', 'hr', 'li', 'dt', 'dd'
 345                         );
 346                         $htmlsingleonly = array( # Elements that cannot have close tags
 347                                 'br', 'hr'
 348                         );
 349                         $htmlnest = array( # Tags that can be nested--??
 350                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
 351                                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
 352                         );
 353                         $tabletags = array( # Can only appear inside table
 354                                 'td', 'th', 'tr'
 355                         );
 356                 } else {
 357                         $htmlpairs = array();
 358                         $htmlsingle = array();
 359                         $htmlnest = array();
 360                         $tabletags = array();
 361                 }
 362
 363                 $htmlsingle = array_merge( $tabletags, $htmlsingle );
 364                 $htmlelements = array_merge( $htmlsingle, $htmlpairs );
 365
 366                 # Remove HTML comments
 367                 $text = Sanitizer::removeHTMLcomments( $text );
 368
 369                 $bits = explode( '<', $text );
 370                 $text = array_shift( $bits );
 371                 if(!$wgUseTidy) {
 372                         $tagstack = array(); $tablestack = array();
 373                         foreach ( $bits as $x ) {
 374                                 $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
 375                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 376                                 $x, $regs );
 377                                 list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 378                                 error_reporting( $prev );
 379
 380                                 $badtag = 0 ;
 381                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 382                                         # Check our stack
 383                                         if ( $slash ) {
 384                                                 # Closing a tag...
 385                                                 if( in_array( $t, $htmlsingleonly ) ) {
 386                                                         $badtag = 1;
 387                                                 } elseif( !in_array( $t, $htmlsingle ) &&
 388                                                 ( $ot = @array_pop( $tagstack ) ) != $t ) {
 389                                                         @array_push( $tagstack, $ot );
 390                                                         $badtag = 1;
 391                                                 } else {
 392                                                         if ( $t == 'table' ) {
 393                                                                 $tagstack = array_pop( $tablestack );
 394                                                         }
 395                                                         $newparams = '';
 396                                                 }
 397                                         } else {
 398                                                 # Keep track for later
 399                                                 if ( in_array( $t, $tabletags ) &&
 400                                                 ! in_array( 'table', $tagstack ) ) {
 401                                                         $badtag = 1;
 402                                                 } else if ( in_array( $t, $tagstack ) &&
 403                                                 ! in_array ( $t , $htmlnest ) ) {
 404                                                         $badtag = 1 ;
 405                                                 } elseif( in_array( $t, $htmlsingleonly ) ) {
 406                                                         # Hack to force empty tag for uncloseable elements
 407                                                         $brace = '/>';
 408                                                 } else if ( ! in_array( $t, $htmlsingle ) ) {
 409                                                         if ( $t == 'table' ) {
 410                                                                 array_push( $tablestack, $tagstack );
 411                                                                 $tagstack = array();
 412                                                         }
 413                                                         array_push( $tagstack, $t );
 414                                                 }
 415
 416                                                 # Replace any variables or template parameters with
 417                                                 # plaintext results.
 418                                                 if( is_callable( $processCallback ) ) {
 419                                                         call_user_func_array( $processCallback, array( &$params, $args ) );
 420                                                 }
 421
 422                                                 # Strip non-approved attributes from the tag
 423                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 424                                         }
 425                                         if ( ! $badtag ) {
 426                                                 $rest = str_replace( '>', '&gt;', $rest );
 427                                                 $close = ( $brace == '/>' ) ? ' /' : '';
 428                                                 $text .= "<$slash$t$newparams$close>$rest";
 429                                                 continue;
 430                                         }
 431                                 }
 432                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 433                         }
 434                         # Close off any remaining tags
 435                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 436                                 $text .= "</$t>\n";
 437                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 438                         }
 439                 } else {
 440                         # this might be possible using tidy itself
 441                         foreach ( $bits as $x ) {
 442                                 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
 443                                 $x, $regs );
 444                                 @list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 445                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 446                                         if( is_callable( $processCallback ) ) {
 447                                                 call_user_func_array( $processCallback, array( &$params, $args ) );
 448                                         }
 449                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 450                                         $rest = str_replace( '>', '&gt;', $rest );
 451                                         $text .= "<$slash$t$newparams$brace$rest";
 452                                 } else {
 453                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 454                                 }
 455                         }
 456                 }
 457                 wfProfileOut( $fname );
 458                 return $text;
 459         }
 460
 461         /**
 462          * Remove '<!--', '-->', and everything between.
 463          * To avoid leaving blank lines, when a comment is both preceded
 464          * and followed by a newline (ignoring spaces), trim leading and
 465          * trailing spaces and one of the newlines.
 466          *
 467          * @access private
 468          * @param string $text
 469          * @return string
 470          */
 471         function removeHTMLcomments( $text ) {
 472                 $fname='Parser::removeHTMLcomments';
 473                 wfProfileIn( $fname );
 474                 while (($start = strpos($text, '<!--')) !== false) {
 475                         $end = strpos($text, '-->', $start + 4);
 476                         if ($end === false) {
 477                                 # Unterminated comment; bail out
 478                                 break;
 479                         }
 480
 481                         $end += 3;
 482
 483                         # Trim space and newline if the comment is both
 484                         # preceded and followed by a newline
 485                         $spaceStart = max($start - 1, 0);
 486                         $spaceLen = $end - $spaceStart;
 487                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 488                                 $spaceStart--;
 489                                 $spaceLen++;
 490                         }
 491                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 492                                 $spaceLen++;
 493                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 494                                 # Remove the comment, leading and trailing
 495                                 # spaces, and leave only one newline.
 496                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 497                         }
 498                         else {
 499                                 # Remove just the comment.
 500                                 $text = substr_replace($text, '', $start, $end - $start);
 501                         }
 502                 }
 503                 wfProfileOut( $fname );
 504                 return $text;
 505         }
 506
 507         /**
 508          * Take a tag soup fragment listing an HTML element's attributes
 509          * and normalize it to well-formed XML, discarding unwanted attributes.
 510          *
 511          * - Normalizes attribute names to lowercase
 512          * - Discards attributes not on a whitelist for the given element
 513          * - Turns broken or invalid entities into plaintext
 514          * - Double-quotes all attribute values
 515          * - Attributes without values are given the name as attribute
 516          * - Double attributes are discarded
 517          * - Unsafe style attributes are discarded
 518          * - Prepends space if there are attributes.
 519          *
 520          * @param string $text
 521          * @param string $element
 522          * @return string
 523          *
 524          * @todo Check for legal values where the DTD limits things.
 525          * @todo Check for unique id attribute :P
 526          */
 527         function fixTagAttributes( $text, $element ) {
 528                 if( trim( $text ) == '' ) {
 529                         return '';
 530                 }
 531
 532                 # Unquoted attribute
 533                 # Since we quote this later, this can be anything distinguishable
 534                 # from the end of the attribute
 535                 if( !preg_match_all(
 536                         MW_ATTRIBS_REGEX,
 537                         $text,
 538                         $pairs,
 539                         PREG_SET_ORDER ) ) {
 540                         return '';
 541                 }
 542
 543                 $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
 544                 $attribs = array();
 545                 foreach( $pairs as $set ) {
 546                         $attribute = strtolower( $set[1] );
 547                         if( !isset( $whitelist[$attribute] ) ) {
 548                                 continue;
 549                         }
 550
 551                         $raw   = Sanitizer::getTagAttributeCallback( $set );
 552                         $value = Sanitizer::normalizeAttributeValue( $raw );
 553
 554                         # Strip javascript "expression" from stylesheets.
 555                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 556                         if( $attribute == 'style' ) {
 557                                 $stripped = Sanitizer::decodeCharReferences( $value );
 558
 559                                 // Remove any comments; IE gets token splitting wrong
 560                                 $stripped = preg_replace( '!/\\*.*?\\*/!S', ' ', $stripped );
 561                                 $value = htmlspecialchars( $stripped );
 562
 563                                 // ... and continue checks
 564                                 $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
 565                                         'codepointToUtf8(hexdec("$1"))', $stripped );
 566                                 $stripped = str_replace( '\\', '', $stripped );
 567                                 if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
 568                                                 $stripped ) ) {
 569                                         # haxx0r
 570                                         continue;
 571                                 }
 572                         }
 573
 574                         # Templates and links may be expanded in later parsing,
 575                         # creating invalid or dangerous output. Suppress this.
 576                         $value = strtr( $value, array(
 577                                 '{'    => '&#123;',
 578                                 '['    => '&#91;',
 579                                 "''"   => '&#39;&#39;',
 580                                 'ISBN' => '&#73;SBN',
 581                                 'RFC'  => '&#82;FC',
 582                                 'PMID' => '&#80;MID',
 583                         ) );
 584
 585                         # Stupid hack
 586                         $value = preg_replace_callback(
 587                                 '/(' . wfUrlProtocols() . ')/',
 588                                 array( 'Sanitizer', 'armorLinksCallback' ),
 589                                 $value );
 590
 591                         // If this attribute was previously set, override it.
 592                         // Output should only have one attribute of each name.
 593                         $attribs[$attribute] = "$attribute=\"$value\"";
 594                 }
 595                 if( empty( $attribs ) ) {
 596                         return '';
 597                 } else {
 598                         return ' ' . implode( ' ', $attribs );
 599                 }
 600         }
 601
 602         /**
 603          * Regex replace callback for armoring links against further processing.
 604          * @param array $matches
 605          * @return string
 606          * @access private
 607          */
 608         function armorLinksCallback( $matches ) {
 609                 return str_replace( ':', '&#58;', $matches[1] );
 610         }
 611
 612         /**
 613          * Return an associative array of attribute names and values from
         * a partial tag string. Attribute names are forced to lowercase,
 615          * character references are decoded to UTF-8 text.
 616          *
 617          * @param string
 618          * @return array
 619          */
 620         function decodeTagAttributes( $text ) {
 621                 $attribs = array();
 622
 623                 if( trim( $text ) == '' ) {
 624                         return $attribs;
 625                 }
 626
 627                 if( !preg_match_all(
 628                         MW_ATTRIBS_REGEX,
 629                         $text,
 630                         $pairs,
 631                         PREG_SET_ORDER ) ) {
 632                         return $attribs;
 633                 }
 634
 635                 foreach( $pairs as $set ) {
 636                         $attribute = strtolower( $set[1] );
 637                         $value = Sanitizer::getTagAttributeCallback( $set );
 638                         $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
 639                 }
 640                 return $attribs;
 641         }
 642
 643         /**
 644          * Pick the appropriate attribute value from a match set from the
 645          * MW_ATTRIBS_REGEX matches.
 646          *
 647          * @param array $set
 648          * @return string
 649          * @access private
 650          */
 651         function getTagAttributeCallback( $set ) {
 652                 if( isset( $set[6] ) ) {
 653                         # Illegal #XXXXXX color with no quotes.
 654                         return $set[6];
 655                 } elseif( isset( $set[5] ) ) {
 656                         # No quotes.
 657                         return $set[5];
 658                 } elseif( isset( $set[4] ) ) {
 659                         # Single-quoted
 660                         return $set[4];
 661                 } elseif( isset( $set[3] ) ) {
 662                         # Double-quoted
 663                         return $set[3];
 664                 } elseif( !isset( $set[2] ) ) {
 665                         # In XHTML, attributes must have a value.
 666                         # For 'reduced' form, return explicitly the attribute name here.
 667                         return $set[1];
 668                 } else {
 669                         wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
 670                 }
 671         }
 672
 673         /**
 674          * Normalize whitespace and character references in an XML source-
 675          * encoded text for an attribute value.
 676          *
 677          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 678          * but note that we're not returning the value, but are returning
 679          * XML source fragments that will be slapped into output.
 680          *
 681          * @param string $text
 682          * @return string
 683          * @access private
 684          */
 685         function normalizeAttributeValue( $text ) {
 686                 return str_replace( '"', '&quot;',
 687                         preg_replace(
 688                                 '/\r\n|[\x20\x0d\x0a\x09]/',
 689                                 ' ',
 690                                 Sanitizer::normalizeCharReferences( $text ) ) );
 691         }
 692
 693         /**
 694          * Ensure that any entities and character references are legal
 695          * for XML and XHTML specifically. Any stray bits will be
 696          * &amp;-escaped to result in a valid text fragment.
 697          *
 698          * a. any named char refs must be known in XHTML
 699          * b. any numeric char refs must be legal chars, not invalid or forbidden
 700          * c. use &#x, not &#X
 701          * d. fix or reject non-valid attributes
 702          *
 703          * @param string $text
 704          * @return string
 705          * @access private
 706          */
 707         function normalizeCharReferences( $text ) {
 708                 return preg_replace_callback(
 709                         MW_CHAR_REFS_REGEX,
 710                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
 711                         $text );
 712         }
 713         /**
         * @param array $matches Match groups passed in by preg_replace_callback
 715          * @return string
 716          */
 717         function normalizeCharReferencesCallback( $matches ) {
 718                 $ret = null;
 719                 if( $matches[1] != '' ) {
 720                         $ret = Sanitizer::normalizeEntity( $matches[1] );
 721                 } elseif( $matches[2] != '' ) {
 722                         $ret = Sanitizer::decCharReference( $matches[2] );
 723                 } elseif( $matches[3] != ''  ) {
 724                         $ret = Sanitizer::hexCharReference( $matches[3] );
 725                 } elseif( $matches[4] != '' ) {
 726                         $ret = Sanitizer::hexCharReference( $matches[4] );
 727                 }
 728                 if( is_null( $ret ) ) {
 729                         return htmlspecialchars( $matches[0] );
 730                 } else {
 731                         return $ret;
 732                 }
 733         }
 734
 735         /**
 736          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 737          * return the named entity reference as is. Otherwise, returns
 738          * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
 739          *
 740          * @param string $name
 741          * @return string
 742          */
 743         function normalizeEntity( $name ) {
 744                 global $wgHtmlEntities;
 745                 if( isset( $wgHtmlEntities[$name] ) ) {
 746                         return "&$name;";
 747                 } else {
 748                         return "&amp;$name;";
 749                 }
 750         }
 751
 752         function decCharReference( $codepoint ) {
 753                 $point = intval( $codepoint );
 754                 if( Sanitizer::validateCodepoint( $point ) ) {
 755                         return sprintf( '&#%d;', $point );
 756                 } else {
 757                         return null;
 758                 }
 759         }
 760
 761         function hexCharReference( $codepoint ) {
 762                 $point = hexdec( $codepoint );
 763                 if( Sanitizer::validateCodepoint( $point ) ) {
 764                         return sprintf( '&#x%x;', $point );
 765                 } else {
 766                         return null;
 767                 }
 768         }
 769
 770         /**
 771          * Returns true if a given Unicode codepoint is a valid character in XML.
 772          * @param int $codepoint
 773          * @return bool
 774          */
 775         function validateCodepoint( $codepoint ) {
 776                 return ($codepoint ==    0x09)
 777                         || ($codepoint ==    0x0a)
 778                         || ($codepoint ==    0x0d)
 779                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
 780                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
 781                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
 782         }
 783
 784         /**
 785          * Decode any character references, numeric or named entities,
 786          * in the text and return a UTF-8 string.
 787          *
 788          * @param string $text
 789          * @return string
 790          * @access public
 791          */
 792         function decodeCharReferences( $text ) {
 793                 return preg_replace_callback(
 794                         MW_CHAR_REFS_REGEX,
 795                         array( 'Sanitizer', 'decodeCharReferencesCallback' ),
 796                         $text );
 797         }
 798
 799         /**
         * @param array $matches Match groups passed in by preg_replace_callback
 801          * @return string
 802          */
 803         function decodeCharReferencesCallback( $matches ) {
 804                 if( $matches[1] != '' ) {
 805                         return Sanitizer::decodeEntity( $matches[1] );
 806                 } elseif( $matches[2] != '' ) {
 807                         return  Sanitizer::decodeChar( intval( $matches[2] ) );
 808                 } elseif( $matches[3] != ''  ) {
 809                         return  Sanitizer::decodeChar( hexdec( $matches[3] ) );
 810                 } elseif( $matches[4] != '' ) {
 811                         return  Sanitizer::decodeChar( hexdec( $matches[4] ) );
 812                 }
 813                 # Last case should be an ampersand by itself
 814                 return $matches[0];
 815         }
 816
 817         /**
 818          * Return UTF-8 string for a codepoint if that is a valid
 819          * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 820          * @param int $codepoint
 821          * @return string
 822          * @access private
 823          */
 824         function decodeChar( $codepoint ) {
 825                 if( Sanitizer::validateCodepoint( $codepoint ) ) {
 826                         return codepointToUtf8( $codepoint );
 827                 } else {
 828                         return UTF8_REPLACEMENT;
 829                 }
 830         }
 831
 832         /**
 833          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 834          * return the UTF-8 encoding of that character. Otherwise, returns
 835          * pseudo-entity source (eg &foo;)
 836          *
 837          * @param string $name
 838          * @return string
 839          */
 840         function decodeEntity( $name ) {
 841                 global $wgHtmlEntities;
 842                 if( isset( $wgHtmlEntities[$name] ) ) {
 843                         return codepointToUtf8( $wgHtmlEntities[$name] );
 844                 } else {
 845                         return "&$name;";
 846                 }
 847         }
 848
 849         /**
 850          * Fetch the whitelist of acceptable attributes for a given
 851          * element name.
 852          *
 853          * @param string $element
 854          * @return array
 855          */
 856         function attributeWhitelist( $element ) {
 857                 static $list;
 858                 if( !isset( $list ) ) {
 859                         $list = Sanitizer::setupAttributeWhitelist();
 860                 }
 861                 return isset( $list[$element] )
 862                         ? $list[$element]
 863                         : array();
 864         }
 865
 866         /**
 867          * @return array
 868          */
 869         function setupAttributeWhitelist() {
 870                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
 871                 $block = array_merge( $common, array( 'align' ) );
 872                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
 873                 $tablecell = array( 'abbr',
 874                                     'axis',
 875                                     'headers',
 876                                     'scope',
 877                                     'rowspan',
 878                                     'colspan',
 879                                     'nowrap', # deprecated
 880                                     'width',  # deprecated
 881                                     'height', # deprecated
 882                                     'bgcolor' # deprecated
 883                                     );
 884
 885                 # Numbers refer to sections in HTML 4.01 standard describing the element.
 886                 # See: http://www.w3.org/TR/html4/
 887                 $whitelist = array (
 888                         # 7.5.4
 889                         'div'        => $block,
 890                         'center'     => $common, # deprecated
 891                         'span'       => $block, # ??
 892
 893                         # 7.5.5
 894                         'h1'         => $block,
 895                         'h2'         => $block,
 896                         'h3'         => $block,
 897                         'h4'         => $block,
 898                         'h5'         => $block,
 899                         'h6'         => $block,
 900
 901                         # 7.5.6
 902                         # address
 903
 904                         # 8.2.4
 905                         # bdo
 906
 907                         # 9.2.1
 908                         'em'         => $common,
 909                         'strong'     => $common,
 910                         'cite'       => $common,
 911                         # dfn
 912                         'code'       => $common,
 913                         # samp
 914                         # kbd
 915                         'var'        => $common,
 916                         # abbr
 917                         # acronym
 918
 919                         # 9.2.2
 920                         'blockquote' => array_merge( $common, array( 'cite' ) ),
 921                         # q
 922
 923                         # 9.2.3
 924                         'sub'        => $common,
 925                         'sup'        => $common,
 926
 927                         # 9.3.1
 928                         'p'          => $block,
 929
 930                         # 9.3.2
 931                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
 932
 933                         # 9.3.4
 934                         'pre'        => array_merge( $common, array( 'width' ) ),
 935
 936                         # 9.4
 937                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 938                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 939
 940                         # 10.2
 941                         'ul'         => array_merge( $common, array( 'type' ) ),
 942                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
 943                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
 944
 945                         # 10.3
 946                         'dl'         => $common,
 947                         'dd'         => $common,
 948                         'dt'         => $common,
 949
 950                         # 11.2.1
 951                         'table'      => array_merge( $common,
 952                                                                 array( 'summary', 'width', 'border', 'frame',
 953                                                                                          'rules', 'cellspacing', 'cellpadding',
 954                                                                                          'align', 'bgcolor', 'frame', 'rules',
 955                                                                                          'border' ) ),
 956
 957                         # 11.2.2
 958                         'caption'    => array_merge( $common, array( 'align' ) ),
 959
 960                         # 11.2.3
 961                         'thead'      => array_merge( $common, $tablealign ),
 962                         'tfoot'      => array_merge( $common, $tablealign ),
 963                         'tbody'      => array_merge( $common, $tablealign ),
 964
 965                         # 11.2.4
 966                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
 967                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
 968
 969                         # 11.2.5
 970                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
 971
 972                         # 11.2.6
 973                         'td'         => array_merge( $common, $tablecell, $tablealign ),
 974                         'th'         => array_merge( $common, $tablecell, $tablealign ),
 975
 976                         # 15.2.1
 977                         'tt'         => $common,
 978                         'b'          => $common,
 979                         'i'          => $common,
 980                         'big'        => $common,
 981                         'small'      => $common,
 982                         'strike'     => $common,
 983                         's'          => $common,
 984                         'u'          => $common,
 985
 986                         # 15.2.2
 987                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
 988                         # basefont
 989
 990                         # 15.3
 991                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
 992
 993                         # XHTML Ruby annotation text module, simple ruby only.
 994                         # http://www.w3c.org/TR/ruby/
 995                         'ruby'       => $common,
 996                         # rbc
 997                         # rtc
 998                         'rb'         => $common,
 999                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
1000                         'rp'         => $common,
1001                         );
1002                 return $whitelist;
1003         }
1004
1005         /**
1006          * Take a fragment of (potentially invalid) HTML and return
1007          * a version with any tags removed, encoded suitably for literal
1008          * inclusion in an attribute value.
1009          *
1010          * @param string $text HTML fragment
1011          * @return string
1012          */
1013         function stripAllTags( $text ) {
1014                 # Actual <tags>
1015                 $text = preg_replace( '/<[^>]*>/', '', $text );
1016
1017                 # Normalize &entities and whitespace
1018                 $text = Sanitizer::normalizeAttributeValue( $text );
1019
1020                 # Will be placed into "double-quoted" attributes,
1021                 # make sure remaining bits are safe.
1022                 $text = str_replace(
1023                         array('<', '>', '"'),
1024                         array('&lt;', '&gt;', '&quot;'),
1025                         $text );
1026
1027                 return $text;
1028         }
1029
1030 }
1031
1032 ?>