includes/Sanitizer.php

   1 <?php
   2
   3 /**
   4  * (X)HTML sanitizer for MediaWiki
   5  *
   6  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   7  * http://www.mediawiki.org/
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License along
  20  * with this program; if not, write to the Free Software Foundation, Inc.,
  21  * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  22  * http://www.gnu.org/copyleft/gpl.html
  23  *
  24  * @package MediaWiki
  25  * @subpackage Parser
  26  */
  27
  28 class Sanitizer {
  29         /**
  30          * Cleans up HTML, removes dangerous tags and attributes, and
  31          * removes HTML comments
  32          * @access private
  33          * @param string $text
  34          * @return string
  35          */
  36         function removeHTMLtags( $text ) {
  37                 global $wgUseTidy, $wgUserHtml;
  38                 $fname = 'Parser::removeHTMLtags';
  39                 wfProfileIn( $fname );
  40
  41                 if( $wgUserHtml ) {
  42                         $htmlpairs = array( # Tags that must be closed
  43                                 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
  44                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
  45                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
  46                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
  47                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
  48                         );
  49                         $htmlsingle = array(
  50                                 'br', 'hr', 'li', 'dt', 'dd'
  51                         );
  52                         $htmlnest = array( # Tags that can be nested--??
  53                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
  54                                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
  55                         );
  56                         $tabletags = array( # Can only appear inside table
  57                                 'td', 'th', 'tr'
  58                         );
  59                 } else {
  60                         $htmlpairs = array();
  61                         $htmlsingle = array();
  62                         $htmlnest = array();
  63                         $tabletags = array();
  64                 }
  65
  66                 $htmlsingle = array_merge( $tabletags, $htmlsingle );
  67                 $htmlelements = array_merge( $htmlsingle, $htmlpairs );
  68
  69                 # Remove HTML comments
  70                 $text = Sanitizer::removeHTMLcomments( $text );
  71
  72                 $bits = explode( '<', $text );
  73                 $text = array_shift( $bits );
  74                 if(!$wgUseTidy) {
  75                         $tagstack = array(); $tablestack = array();
  76                         foreach ( $bits as $x ) {
  77                                 $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
  78                                 preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
  79                                 $x, $regs );
  80                                 list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
  81                                 error_reporting( $prev );
  82
  83                                 $badtag = 0 ;
  84                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
  85                                         # Check our stack
  86                                         if ( $slash ) {
  87                                                 # Closing a tag...
  88                                                 if ( ! in_array( $t, $htmlsingle ) &&
  89                                                 ( $ot = @array_pop( $tagstack ) ) != $t ) {
  90                                                         @array_push( $tagstack, $ot );
  91                                                         $badtag = 1;
  92                                                 } else {
  93                                                         if ( $t == 'table' ) {
  94                                                                 $tagstack = array_pop( $tablestack );
  95                                                         }
  96                                                         $newparams = '';
  97                                                 }
  98                                         } else {
  99                                                 # Keep track for later
 100                                                 if ( in_array( $t, $tabletags ) &&
 101                                                 ! in_array( 'table', $tagstack ) ) {
 102                                                         $badtag = 1;
 103                                                 } else if ( in_array( $t, $tagstack ) &&
 104                                                 ! in_array ( $t , $htmlnest ) ) {
 105                                                         $badtag = 1 ;
 106                                                 } else if ( ! in_array( $t, $htmlsingle ) ) {
 107                                                         if ( $t == 'table' ) {
 108                                                                 array_push( $tablestack, $tagstack );
 109                                                                 $tagstack = array();
 110                                                         }
 111                                                         array_push( $tagstack, $t );
 112                                                 }
 113                                                 # Strip non-approved attributes from the tag
 114                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 115                                         }
 116                                         if ( ! $badtag ) {
 117                                                 $rest = str_replace( '>', '&gt;', $rest );
 118                                                 $text .= "<$slash$t$newparams$brace$rest";
 119                                                 continue;
 120                                         }
 121                                 }
 122                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 123                         }
 124                         # Close off any remaining tags
 125                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 126                                 $text .= "</$t>\n";
 127                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 128                         }
 129                 } else {
 130                         # this might be possible using tidy itself
 131                         foreach ( $bits as $x ) {
 132                                 preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
 133                                 $x, $regs );
 134                                 @list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 135                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 136                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 137                                         $rest = str_replace( '>', '&gt;', $rest );
 138                                         $text .= "<$slash$t$newparams$brace$rest";
 139                                 } else {
 140                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 141                                 }
 142                         }
 143                 }
 144                 wfProfileOut( $fname );
 145                 return $text;
 146         }
 147
 148         /**
 149          * Remove '<!--', '-->', and everything between.
 150          * To avoid leaving blank lines, when a comment is both preceded
 151          * and followed by a newline (ignoring spaces), trim leading and
 152          * trailing spaces and one of the newlines.
 153          *
 154          * @access private
 155          * @param string $text
 156          * @return string
 157          */
 158         function removeHTMLcomments( $text ) {
 159                 $fname='Parser::removeHTMLcomments';
 160                 wfProfileIn( $fname );
 161                 while (($start = strpos($text, '<!--')) !== false) {
 162                         $end = strpos($text, '-->', $start + 4);
 163                         if ($end === false) {
 164                                 # Unterminated comment; bail out
 165                                 break;
 166                         }
 167
 168                         $end += 3;
 169
 170                         # Trim space and newline if the comment is both
 171                         # preceded and followed by a newline
 172                         $spaceStart = max($start - 1, 0);
 173                         $spaceLen = $end - $spaceStart;
 174                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 175                                 $spaceStart--;
 176                                 $spaceLen++;
 177                         }
 178                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 179                                 $spaceLen++;
 180                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 181                                 # Remove the comment, leading and trailing
 182                                 # spaces, and leave only one newline.
 183                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 184                         }
 185                         else {
 186                                 # Remove just the comment.
 187                                 $text = substr_replace($text, '', $start, $end - $start);
 188                         }
 189                 }
 190                 wfProfileOut( $fname );
 191                 return $text;
 192         }
 193
 194         /**
 195          * Take a tag soup fragment listing an HTML element's attributes
 196          * and normalize it to well-formed XML, discarding unwanted attributes.
 197          *
 198          * - Normalizes attribute names to lowercase
 199          * - Discards attributes not on a whitelist for the given element
 200          * - Turns broken or invalid entities into plaintext
 201          * - Double-quotes all attribute values
 202          * - Attributes without values are given the name as attribute
 203          * - Double attributes are discarded
 204          * - Unsafe style attributes are discarded
 205          * - Prepends space if there are attributes.
 206          *
 207          * @param string $text
 208          * @param string $element
 209          * @return string
 210          *
 211          * @todo Check for legal values where the DTD limits things.
 212          * @todo Check for unique id attribute :P
 213          */
 214         function fixTagAttributes( $text, $element ) {
 215                 if( trim( $text ) == '' ) {
 216                         return '';
 217                 }
 218
 219                 # Unquoted attribute
 220                 # Since we quote this later, this can be anything distinguishable
 221                 # from the end of the attribute
 222                 $attrib = '[A-Za-z0-9]';
 223                 $space = '[\x09\x0a\x0d\x20]';
 224                 if( !preg_match_all(
 225                         "/(?:^|$space)($attrib+)
 226                           ($space*=$space*
 227                             (?:
 228                              # The attribute value: quoted or alone
 229                               \"([^<\"]*)\"
 230                              | '([^<']*)'
 231                              |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
 232                              |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
 233                                                  # colors are specified like this.
 234                                                  # We'll be normalizing it.
 235                             )
 236                            )?(?=$space|\$)/sx",
 237                         $text,
 238                         $pairs,
 239                         PREG_SET_ORDER ) ) {
 240                         return '';
 241                 }
 242
 243                 $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
 244                 $attribs = array();
 245                 foreach( $pairs as $set ) {
 246                         $attribute = strtolower( $set[1] );
 247                         if( !isset( $whitelist[$attribute] ) ) {
 248                                 continue;
 249                         } elseif( isset( $set[6] ) ) {
 250                                 # Illegal #XXXXXX color with no quotes.
 251                                 $value = Sanitizer::normalizeAttributeValue( $set[6] );
 252                         } elseif( isset( $set[5] ) ) {
 253                                 # No quotes.
 254                                 $value = Sanitizer::normalizeAttributeValue( $set[5] );
 255                         } elseif( isset( $set[4] ) ) {
 256                                 # Single-quoted
 257                                 $value = str_replace( '"', '&quot;',
 258                                         Sanitizer::normalizeAttributeValue( $set[4] ) );
 259                         } elseif( isset( $set[3] ) ) {
 260                                 # Double-quoted
 261                                 $value = Sanitizer::normalizeAttributeValue( $set[3] );
 262                         } elseif( !isset( $set[2] ) ) {
 263                                 # In XHTML, attributes must have a value.
 264                                 $value = $set[1];
 265                         } else {
 266                                 wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
 267                         }
 268
 269                         # Strip javascript "expression" from stylesheets.
 270                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 271                         if( $attribute == 'style' && preg_match(
 272                                 '/(expression|tps*:\/\/|url\\s*\().*/is',
 273                                         wfMungeToUtf8( $value ) ) ) {
 274                                 # haxx0r
 275                                 continue;
 276                         }
 277
 278                         if( !isset( $attribs[$attribute] ) ) {
 279                                 $attribs[$attribute] = "$attribute=\"$value\"";
 280                         }
 281                 }
 282                 if( empty( $attribs ) ) {
 283                         return '';
 284                 } else {
 285                         return ' ' . implode( ' ', $attribs );
 286                 }
 287         }
 288
 289         /**
 290          * Normalize whitespace and character references in an XML source-
 291          * encoded text for an attribute value.
 292          *
 293          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 294          * but note that we're not returning the value, but are returning
 295          * XML source fragments that will be slapped into output.
 296          *
 297          * @param string $text
 298          * @return string
 299          * @access private
 300          */
 301         function normalizeAttributeValue( $text ) {
 302                 return preg_replace(
 303                         '/\r\n|[\x20\x0d\x0a\x09]/',
 304                         ' ',
 305                         Sanitizer::normalizeCharReferences( $text ) );
 306         }
 307
 308         /**
 309          * Ensure that any entities and character references are legal
 310          * for XML and XHTML specifically. Any stray bits will be
 311          * &amp;-escaped to result in a valid text fragment.
 312          *
 313          * a. any named char refs must be known in XHTML
 314          * b. any numeric char refs must be legal chars, not invalid or forbidden
 315          * c. use &#x, not &#X
 316          * d. fix or reject non-valid attributes
 317          *
 318          * @param string $text
 319          * @return string
 320          * @access private
 321          */
 322         function normalizeCharReferences( $text ) {
 323                 return preg_replace_callback(
 324                         '/&([A-Za-z0-9]+);
 325                          |&\#([0-9]+);
 326                          |&\#x([0-9A-Za-z]+);
 327                          |&\#X([0-9A-Za-z]+);
 328                          |(&)/x',
 329                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
 330                         $text );
 331         }
 332         /**
 333          * @param string $matches
 334          * @return string
 335          */
 336         function normalizeCharReferencesCallback( $matches ) {
 337                 $ret = null;
 338                 if( $matches[1] != '' ) {
 339                         $ret = Sanitizer::normalizeEntity( $matches[1] );
 340                 } elseif( $matches[2] != '' ) {
 341                         $ret = Sanitizer::decCharReference( $matches[2] );
 342                 } elseif( $matches[3] != ''  ) {
 343                         $ret = Sanitizer::hexCharReference( $matches[3] );
 344                 } elseif( $matches[4] != '' ) {
 345                         $ret = Sanitizer::hexCharReference( $matches[4] );
 346                 }
 347                 if( is_null( $ret ) ) {
 348                         return htmlspecialchars( $matches[0] );
 349                 } else {
 350                         return $ret;
 351                 }
 352         }
 353
 354         /**
 355          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 356          * return the named entity reference as is. Otherwise, returns
 357          * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
 358          *
 359          * @param string $name
 360          * @return string
 361          */
 362         function normalizeEntity( $name ) {
 363                 # List of all named character entities defined in HTML 4.01
 364                 # http://www.w3.org/TR/html4/sgml/entities.html
 365                 static $htmlEntities = array(
 366                         'aacute' => true,
 367                         'Aacute' => true,
 368                         'acirc' => true,
 369                         'Acirc' => true,
 370                         'acute' => true,
 371                         'aelig' => true,
 372                         'AElig' => true,
 373                         'agrave' => true,
 374                         'Agrave' => true,
 375                         'alefsym' => true,
 376                         'alpha' => true,
 377                         'Alpha' => true,
 378                         'amp' => true,
 379                         'and' => true,
 380                         'ang' => true,
 381                         'apos' => true,
 382                         'aring' => true,
 383                         'Aring' => true,
 384                         'asymp' => true,
 385                         'atilde' => true,
 386                         'Atilde' => true,
 387                         'auml' => true,
 388                         'Auml' => true,
 389                         'bdquo' => true,
 390                         'beta' => true,
 391                         'Beta' => true,
 392                         'brvbar' => true,
 393                         'bull' => true,
 394                         'cap' => true,
 395                         'ccedil' => true,
 396                         'Ccedil' => true,
 397                         'cedil' => true,
 398                         'cent' => true,
 399                         'chi' => true,
 400                         'Chi' => true,
 401                         'circ' => true,
 402                         'clubs' => true,
 403                         'cong' => true,
 404                         'copy' => true,
 405                         'crarr' => true,
 406                         'cup' => true,
 407                         'curren' => true,
 408                         'dagger' => true,
 409                         'Dagger' => true,
 410                         'darr' => true,
 411                         'dArr' => true,
 412                         'deg' => true,
 413                         'delta' => true,
 414                         'Delta' => true,
 415                         'diams' => true,
 416                         'divide' => true,
 417                         'eacute' => true,
 418                         'Eacute' => true,
 419                         'ecirc' => true,
 420                         'Ecirc' => true,
 421                         'egrave' => true,
 422                         'Egrave' => true,
 423                         'empty' => true,
 424                         'emsp' => true,
 425                         'ensp' => true,
 426                         'epsilon' => true,
 427                         'Epsilon' => true,
 428                         'equiv' => true,
 429                         'eta' => true,
 430                         'Eta' => true,
 431                         'eth' => true,
 432                         'ETH' => true,
 433                         'euml' => true,
 434                         'Euml' => true,
 435                         'euro' => true,
 436                         'exist' => true,
 437                         'fnof' => true,
 438                         'forall' => true,
 439                         'frac12' => true,
 440                         'frac14' => true,
 441                         'frac34' => true,
 442                         'frasl' => true,
 443                         'gamma' => true,
 444                         'Gamma' => true,
 445                         'ge' => true,
 446                         'gt' => true,
 447                         'harr' => true,
 448                         'hArr' => true,
 449                         'hearts' => true,
 450                         'hellip' => true,
 451                         'iacute' => true,
 452                         'Iacute' => true,
 453                         'icirc' => true,
 454                         'Icirc' => true,
 455                         'iexcl' => true,
 456                         'igrave' => true,
 457                         'Igrave' => true,
 458                         'image' => true,
 459                         'infin' => true,
 460                         'int' => true,
 461                         'iota' => true,
 462                         'Iota' => true,
 463                         'iquest' => true,
 464                         'isin' => true,
 465                         'iuml' => true,
 466                         'Iuml' => true,
 467                         'kappa' => true,
 468                         'Kappa' => true,
 469                         'lambda' => true,
 470                         'Lambda' => true,
 471                         'lang' => true,
 472                         'laquo' => true,
 473                         'larr' => true,
 474                         'lArr' => true,
 475                         'lceil' => true,
 476                         'ldquo' => true,
 477                         'le' => true,
 478                         'lfloor' => true,
 479                         'lowast' => true,
 480                         'loz' => true,
 481                         'lrm' => true,
 482                         'lsaquo' => true,
 483                         'lsquo' => true,
 484                         'lt' => true,
 485                         'macr' => true,
 486                         'mdash' => true,
 487                         'micro' => true,
 488                         'middot' => true,
 489                         'minus' => true,
 490                         'mu' => true,
 491                         'Mu' => true,
 492                         'nabla' => true,
 493                         'nbsp' => true,
 494                         'ndash' => true,
 495                         'ne' => true,
 496                         'ni' => true,
 497                         'not' => true,
 498                         'notin' => true,
 499                         'nsub' => true,
 500                         'ntilde' => true,
 501                         'Ntilde' => true,
 502                         'nu' => true,
 503                         'Nu' => true,
 504                         'oacute' => true,
 505                         'Oacute' => true,
 506                         'ocirc' => true,
 507                         'Ocirc' => true,
 508                         'oelig' => true,
 509                         'OElig' => true,
 510                         'ograve' => true,
 511                         'Ograve' => true,
 512                         'oline' => true,
 513                         'omega' => true,
 514                         'Omega' => true,
 515                         'omicron' => true,
 516                         'Omicron' => true,
 517                         'oplus' => true,
 518                         'or' => true,
 519                         'ordf' => true,
 520                         'ordm' => true,
 521                         'oslash' => true,
 522                         'Oslash' => true,
 523                         'otilde' => true,
 524                         'Otilde' => true,
 525                         'otimes' => true,
 526                         'ouml' => true,
 527                         'Ouml' => true,
 528                         'para' => true,
 529                         'part' => true,
 530                         'permil' => true,
 531                         'perp' => true,
 532                         'phi' => true,
 533                         'Phi' => true,
 534                         'pi' => true,
 535                         'Pi' => true,
 536                         'piv' => true,
 537                         'plusmn' => true,
 538                         'pound' => true,
 539                         'prime' => true,
 540                         'Prime' => true,
 541                         'prod' => true,
 542                         'prop' => true,
 543                         'psi' => true,
 544                         'Psi' => true,
 545                         'quot' => true,
 546                         'radic' => true,
 547                         'rang' => true,
 548                         'raquo' => true,
 549                         'rarr' => true,
 550                         'rArr' => true,
 551                         'rceil' => true,
 552                         'rdquo' => true,
 553                         'real' => true,
 554                         'reg' => true,
 555                         'rfloor' => true,
 556                         'rho' => true,
 557                         'Rho' => true,
 558                         'rlm' => true,
 559                         'rsaquo' => true,
 560                         'rsquo' => true,
 561                         'sbquo' => true,
 562                         'scaron' => true,
 563                         'Scaron' => true,
 564                         'sdot' => true,
 565                         'sect' => true,
 566                         'shy' => true,
 567                         'sigma' => true,
 568                         'Sigma' => true,
 569                         'sigmaf' => true,
 570                         'sim' => true,
 571                         'spades' => true,
 572                         'sub' => true,
 573                         'sube' => true,
 574                         'sum' => true,
 575                         'sup' => true,
 576                         'sup1' => true,
 577                         'sup2' => true,
 578                         'sup3' => true,
 579                         'supe' => true,
 580                         'szlig' => true,
 581                         'tau' => true,
 582                         'Tau' => true,
 583                         'there4' => true,
 584                         'theta' => true,
 585                         'Theta' => true,
 586                         'thetasym' => true,
 587                         'thinsp' => true,
 588                         'thorn' => true,
 589                         'THORN' => true,
 590                         'tilde' => true,
 591                         'times' => true,
 592                         'trade' => true,
 593                         'uacute' => true,
 594                         'Uacute' => true,
 595                         'uarr' => true,
 596                         'uArr' => true,
 597                         'ucirc' => true,
 598                         'Ucirc' => true,
 599                         'ugrave' => true,
 600                         'Ugrave' => true,
 601                         'uml' => true,
 602                         'upsih' => true,
 603                         'upsilon' => true,
 604                         'Upsilon' => true,
 605                         'uuml' => true,
 606                         'Uuml' => true,
 607                         'weierp' => true,
 608                         'xi' => true,
 609                         'Xi' => true,
 610                         'yacute' => true,
 611                         'Yacute' => true,
 612                         'yen' => true,
 613                         'yuml' => true,
 614                         'Yuml' => true,
 615                         'zeta' => true,
 616                         'Zeta' => true,
 617                         'zwj' => true,
 618                         'zwnj' => true );
 619                 if( isset( $htmlEntities[$name] ) ) {
 620                         return "&$name;";
 621                 } else {
 622                         return "&amp;$name;";
 623                 }
 624         }
 625
 626         function decCharReference( $codepoint ) {
 627                 $point = IntVal( $codepoint );
 628                 if( Sanitizer::validateCodepoint( $point ) ) {
 629                         return sprintf( '&#%d;', $point );
 630                 } else {
 631                         return null;
 632                 }
 633         }
 634
 635         function hexCharReference( $codepoint ) {
 636                 $point = hexdec( $codepoint );
 637                 if( Sanitizer::validateCodepoint( $point ) ) {
 638                         return sprintf( '&#x%x;', $point );
 639                 } else {
 640                         return null;
 641                 }
 642         }
 643
 644         /**
 645          * Returns true if a given Unicode codepoint is a valid character in XML.
 646          * @param int $codepoint
 647          * @return bool
 648          */
 649         function validateCodepoint( $codepoint ) {
 650                 return ($codepoint ==    0x09)
 651                         || ($codepoint ==    0x0a)
 652                         || ($codepoint ==    0x0d)
 653                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
 654                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
 655                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
 656         }
 657
 658         /**
 659          * Fetch the whitelist of acceptable attributes for a given
 660          * element name.
 661          *
 662          * @param string $element
 663          * @return array
 664          */
 665         function attributeWhitelist( $element ) {
 666                 $list = Sanitizer::setupAttributeWhitelist();
 667                 return isset( $list[$element] )
 668                         ? $list[$element]
 669                         : array();
 670         }
 671
 672         /**
 673          * @return array
 674          */
 675         function setupAttributeWhitelist() {
 676                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
 677                 $block = array_merge( $common, array( 'align' ) );
 678                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
 679                 $tablecell = array( 'abbr',
 680                                     'axis',
 681                                     'headers',
 682                                     'scope',
 683                                     'rowspan',
 684                                     'colspan',
 685                                     'nowrap', # deprecated
 686                                     'width',  # deprecated
 687                                     'height', # deprecated
 688                                     'bgcolor' # deprecated
 689                                     );
 690
 691                 # Numbers refer to sections in HTML 4.01 standard describing the element.
 692                 # See: http://www.w3.org/TR/html4/
 693                 $whitelist = array (
 694                         # 7.5.4
 695                         'div'        => $block,
 696                         'center'     => $common, # deprecated
 697                         'span'       => $block, # ??
 698
 699                         # 7.5.5
 700                         'h1'         => $block,
 701                         'h2'         => $block,
 702                         'h3'         => $block,
 703                         'h4'         => $block,
 704                         'h5'         => $block,
 705                         'h6'         => $block,
 706
 707                         # 7.5.6
 708                         # address
 709
 710                         # 8.2.4
 711                         # bdo
 712
 713                         # 9.2.1
 714                         'em'         => $common,
 715                         'strong'     => $common,
 716                         'cite'       => $common,
 717                         # dfn
 718                         'code'       => $common,
 719                         # samp
 720                         # kbd
 721                         'var'        => $common,
 722                         # abbr
 723                         # acronym
 724
 725                         # 9.2.2
 726                         'blockquote' => array_merge( $common, array( 'cite' ) ),
 727                         # q
 728
 729                         # 9.2.3
 730                         'sub'        => $common,
 731                         'sup'        => $common,
 732
 733                         # 9.3.1
 734                         'p'          => $block,
 735
 736                         # 9.3.2
 737                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
 738
 739                         # 9.3.4
 740                         'pre'        => array_merge( $common, array( 'width' ) ),
 741
 742                         # 9.4
 743                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 744                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 745
 746                         # 10.2
 747                         'ul'         => array_merge( $common, array( 'type' ) ),
 748                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
 749                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
 750
 751                         # 10.3
 752                         'dl'         => $common,
 753                         'dd'         => $common,
 754                         'dt'         => $common,
 755
 756                         # 11.2.1
 757                         'table'      => array_merge( $common,
 758                                                                 array( 'summary', 'width', 'border', 'frame',
 759                                                                                          'rules', 'cellspacing', 'cellpadding',
 760                                                                                          'align', 'bgcolor', 'frame', 'rules',
 761                                                                                          'border' ) ),
 762
 763                         # 11.2.2
 764                         'caption'    => array_merge( $common, array( 'align' ) ),
 765
 766                         # 11.2.3
 767                         'thead'      => array_merge( $common, $tablealign ),
 768                         'tfoot'      => array_merge( $common, $tablealign ),
 769                         'tbody'      => array_merge( $common, $tablealign ),
 770
 771                         # 11.2.4
 772                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
 773                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
 774
 775                         # 11.2.5
 776                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
 777
 778                         # 11.2.6
 779                         'td'         => array_merge( $common, $tablecell, $tablealign ),
 780                         'th'         => array_merge( $common, $tablecell, $tablealign ),
 781
 782                         # 15.2.1
 783                         'tt'         => $common,
 784                         'b'          => $common,
 785                         'i'          => $common,
 786                         'big'        => $common,
 787                         'small'      => $common,
 788                         'strike'     => $common,
 789                         's'          => $common,
 790                         'u'          => $common,
 791
 792                         # 15.2.2
 793                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
 794                         # basefont
 795
 796                         # 15.3
 797                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
 798
 799                         # XHTML Ruby annotation text module, simple ruby only.
 800                         # http://www.w3c.org/TR/ruby/
 801                         'ruby'       => $common,
 802                         # rbc
 803                         # rtc
 804                         'rb'         => $common,
 805                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
 806                         'rp'         => $common,
 807                         );
 808                 return $whitelist;
 809         }
 810
 811         /**
 812          * Take a fragment of (potentially invalid) HTML and return
 813          * a version with any tags removed, encoded suitably for literal
 814          * inclusion in an attribute value.
 815          *
 816          * @param string $text HTML fragment
 817          * @return string
 818          */
 819         function stripAllTags( $text ) {
 820                 # Actual <tags>
 821                 $text = preg_replace( '/<[^>]*>/', '', $text );
 822
 823                 # Normalize &entities and whitespace
 824                 $text = Sanitizer::normalizeAttributeValue( $text );
 825
 826                 # Will be placed into "double-quoted" attributes,
 827                 # make sure remaining bits are safe.
 828                 $text = str_replace(
 829                         array('<', '>', '"'),
 830                         array('&lt;', '&gt;', '&quot;'),
 831                         $text );
 832
 833                 return $text;
 834         }
 835
 836 }
 837
 838 ?>