includes/Sanitizer.php

   1 <?php
   2
   3 /**
   4  * (X)HTML sanitizer for MediaWiki
   5  *
   6  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   7  * http://www.mediawiki.org/
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License along
  20  * with this program; if not, write to the Free Software Foundation, Inc.,
  21  * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  22  * http://www.gnu.org/copyleft/gpl.html
  23  *
  24  * @package MediaWiki
  25  * @subpackage Parser
  26  */
  27
  28 class Sanitizer {
  29         /**
  30          * Cleans up HTML, removes dangerous tags and attributes, and
  31          * removes HTML comments
  32          * @access private
  33          * @param string $text
  34          * @return string
  35          */
  36         function removeHTMLtags( $text ) {
  37                 global $wgUseTidy, $wgUserHtml;
  38                 $fname = 'Parser::removeHTMLtags';
  39                 wfProfileIn( $fname );
  40
  41                 if( $wgUserHtml ) {
  42                         $htmlpairs = array( # Tags that must be closed
  43                                 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
  44                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
  45                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
  46                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
  47                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
  48                         );
  49                         $htmlsingle = array(
  50                                 'br', 'hr', 'li', 'dt', 'dd'
  51                         );
  52                         $htmlnest = array( # Tags that can be nested--??
  53                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
  54                                 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
  55                         );
  56                         $tabletags = array( # Can only appear inside table
  57                                 'td', 'th', 'tr'
  58                         );
  59                 } else {
  60                         $htmlpairs = array();
  61                         $htmlsingle = array();
  62                         $htmlnest = array();
  63                         $tabletags = array();
  64                 }
  65
  66                 $htmlsingle = array_merge( $tabletags, $htmlsingle );
  67                 $htmlelements = array_merge( $htmlsingle, $htmlpairs );
  68
  69                 # Remove HTML comments
  70                 $text = Sanitizer::removeHTMLcomments( $text );
  71
  72                 $bits = explode( '<', $text );
  73                 $text = array_shift( $bits );
  74                 if(!$wgUseTidy) {
  75                         $tagstack = array(); $tablestack = array();
  76                         foreach ( $bits as $x ) {
  77                                 $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
  78                                 preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
  79                                 $x, $regs );
  80                                 list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
  81                                 error_reporting( $prev );
  82
  83                                 $badtag = 0 ;
  84                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
  85                                         # Check our stack
  86                                         if ( $slash ) {
  87                                                 # Closing a tag...
  88                                                 if ( ! in_array( $t, $htmlsingle ) &&
  89                                                 ( $ot = @array_pop( $tagstack ) ) != $t ) {
  90                                                         @array_push( $tagstack, $ot );
  91                                                         $badtag = 1;
  92                                                 } else {
  93                                                         if ( $t == 'table' ) {
  94                                                                 $tagstack = array_pop( $tablestack );
  95                                                         }
  96                                                         $newparams = '';
  97                                                 }
  98                                         } else {
  99                                                 # Keep track for later
 100                                                 if ( in_array( $t, $tabletags ) &&
 101                                                 ! in_array( 'table', $tagstack ) ) {
 102                                                         $badtag = 1;
 103                                                 } else if ( in_array( $t, $tagstack ) &&
 104                                                 ! in_array ( $t , $htmlnest ) ) {
 105                                                         $badtag = 1 ;
 106                                                 } else if ( ! in_array( $t, $htmlsingle ) ) {
 107                                                         if ( $t == 'table' ) {
 108                                                                 array_push( $tablestack, $tagstack );
 109                                                                 $tagstack = array();
 110                                                         }
 111                                                         array_push( $tagstack, $t );
 112                                                 }
 113                                                 # Strip non-approved attributes from the tag
 114                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
 115                                         }
 116                                         if ( ! $badtag ) {
 117                                                 $rest = str_replace( '>', '&gt;', $rest );
 118                                                 $text .= "<$slash$t$newparams$brace$rest";
 119                                                 continue;
 120                                         }
 121                                 }
 122                                 $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 123                         }
 124                         # Close off any remaining tags
 125                         while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
 126                                 $text .= "</$t>\n";
 127                                 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
 128                         }
 129                 } else {
 130                         # this might be possible using tidy itself
 131                         foreach ( $bits as $x ) {
 132                                 preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
 133                                 $x, $regs );
 134                                 @list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
 135                                 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
 136                                         $newparams = Sanitizer::fixTagAttributes( $params, $t );
 137                                         $rest = str_replace( '>', '&gt;', $rest );
 138                                         $text .= "<$slash$t$newparams$brace$rest";
 139                                 } else {
 140                                         $text .= '&lt;' . str_replace( '>', '&gt;', $x);
 141                                 }
 142                         }
 143                 }
 144                 wfProfileOut( $fname );
 145                 return $text;
 146         }
 147
 148         /**
 149          * Remove '<!--', '-->', and everything between.
 150          * To avoid leaving blank lines, when a comment is both preceded
 151          * and followed by a newline (ignoring spaces), trim leading and
 152          * trailing spaces and one of the newlines.
 153          *
 154          * @access private
 155          * @param string $text
 156          * @return string
 157          */
 158         function removeHTMLcomments( $text ) {
 159                 $fname='Parser::removeHTMLcomments';
 160                 wfProfileIn( $fname );
 161                 while (($start = strpos($text, '<!--')) !== false) {
 162                         $end = strpos($text, '-->', $start + 4);
 163                         if ($end === false) {
 164                                 # Unterminated comment; bail out
 165                                 break;
 166                         }
 167
 168                         $end += 3;
 169
 170                         # Trim space and newline if the comment is both
 171                         # preceded and followed by a newline
 172                         $spaceStart = max($start - 1, 0);
 173                         $spaceLen = $end - $spaceStart;
 174                         while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
 175                                 $spaceStart--;
 176                                 $spaceLen++;
 177                         }
 178                         while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
 179                                 $spaceLen++;
 180                         if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
 181                                 # Remove the comment, leading and trailing
 182                                 # spaces, and leave only one newline.
 183                                 $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
 184                         }
 185                         else {
 186                                 # Remove just the comment.
 187                                 $text = substr_replace($text, '', $start, $end - $start);
 188                         }
 189                 }
 190                 wfProfileOut( $fname );
 191                 return $text;
 192         }
 193
 194         /**
 195          * Take a tag soup fragment listing an HTML element's attributes
 196          * and normalize it to well-formed XML, discarding unwanted attributes.
 197          *
 198          * - Normalizes attribute names to lowercase
 199          * - Discards attributes not on a whitelist for the given element
 200          * - Turns broken or invalid entities into plaintext
 201          * - Double-quotes all attribute values
 202          * - Attributes without values are given the name as attribute
 203          * - Double attributes are discarded
 204          * - Unsafe style attributes are discarded
 205          * - Prepends space if there are attributes.
 206          *
 207          * @param string $text
 208          * @param string $element
 209          * @return string
 210          *
 211          * @todo Check for legal values where the DTD limits things.
 212          * @todo Check for unique id attribute :P
 213          */
 214         function fixTagAttributes( $text, $element ) {
 215                 if( trim( $text ) == '' ) {
 216                         return '';
 217                 }
 218
 219                 # Unquoted attribute
 220                 # Since we quote this later, this can be anything distinguishable
 221                 # from the end of the attribute
 222                 $attrib = '[A-Za-z0-9]';
 223                 $space = '[\x09\x0a\x0d\x20]';
 224                 if( !preg_match_all(
 225                         "/(?:^|$space)($attrib+)
 226                           ($space*=$space*
 227                             (?:
 228                              # The attribute value: quoted or alone
 229                               \"([^<\"]*)\"
 230                              | '([^<']*)'
 231                              |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
 232                              |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
 233                                                  # colors are specified like this.
 234                                                  # We'll be normalizing it.
 235                             )
 236                            )?(?=$space|\$)/sx",
 237                         $text,
 238                         $pairs,
 239                         PREG_SET_ORDER ) ) {
 240                         return '';
 241                 }
 242
 243                 $whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
 244                 $attribs = array();
 245                 foreach( $pairs as $set ) {
 246                         $attribute = strtolower( $set[1] );
 247                         if( !isset( $whitelist[$attribute] ) ) {
 248                                 continue;
 249                         }
 250                         if( !isset( $set[2] ) ) {
 251                                 # In XHTML, attributes must have a value.
 252                                 $value = $set[1];
 253                         } elseif( $set[3] != '' ) {
 254                                 # Double-quoted
 255                                 $value = Sanitizer::normalizeAttributeValue( $set[3] );
 256                         } elseif( $set[4] != '' ) {
 257                                 # Single-quoted
 258                                 $value = str_replace( '"', '&quot;',
 259                                         Sanitizer::normalizeAttributeValue( $set[4] ) );
 260                         } elseif( $set[5] != '' ) {
 261                                 # No quotes.
 262                                 $value = Sanitizer::normalizeAttributeValue( $set[5] );
 263                         } elseif( $set[6] != '' ) {
 264                                 # Illegal #XXXXXX color with no quotes.
 265                                 $value = Sanitizer::normalizeAttributeValue( $set[6] );
 266                         } else {
 267                                 wfDebugDieBacktrace( "Tag conditions not met. Something's very odd." );
 268                         }
 269
 270                         # Strip javascript "expression" from stylesheets.
 271                         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
 272                         if( $attribute == 'style' && preg_match(
 273                                 '/(expression|tps*:\/\/|url\\s*\().*/is',
 274                                         wfMungeToUtf8( $value ) ) ) {
 275                                 # haxx0r
 276                                 continue;
 277                         }
 278
 279                         if( !isset( $attribs[$attribute] ) ) {
 280                                 $attribs[$attribute] = "$attribute=\"$value\"";
 281                         }
 282                 }
 283                 if( empty( $attribs ) ) {
 284                         return '';
 285                 } else {
 286                         return ' ' . implode( ' ', $attribs );
 287                 }
 288         }
 289
 290         /**
 291          * Normalize whitespace and character references in an XML source-
 292          * encoded text for an attribute value.
 293          *
 294          * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 295          * but note that we're not returning the value, but are returning
 296          * XML source fragments that will be slapped into output.
 297          *
 298          * @param string $text
 299          * @return string
 300          * @access private
 301          */
 302         function normalizeAttributeValue( $text ) {
 303                 return preg_replace(
 304                         '/\r\n|[\x20\x0d\x0a\x09]/',
 305                         ' ',
 306                         Sanitizer::normalizeCharReferences( $text ) );
 307         }
 308
 309         /**
 310          * Ensure that any entities and character references are legal
 311          * for XML and XHTML specifically. Any stray bits will be
 312          * &amp;-escaped to result in a valid text fragment.
 313          *
 314          * a. any named char refs must be known in XHTML
 315          * b. any numeric char refs must be legal chars, not invalid or forbidden
 316          * c. use &#x, not &#X
 317          * d. fix or reject non-valid attributes
 318          *
 319          * @param string $text
 320          * @return string
 321          * @access private
 322          */
 323         function normalizeCharReferences( $text ) {
 324                 return preg_replace_callback(
 325                         '/&([A-Za-z0-9]+);
 326                          |&\#([0-9]+);
 327                          |&\#x([0-9A-Za-z]+);
 328                          |&\#X([0-9A-Za-z]+);
 329                          |(&)/x',
 330                         array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
 331                         $text );
 332         }
 333         /**
 334          * @param string $matches
 335          * @return string
 336          */
 337         function normalizeCharReferencesCallback( $matches ) {
 338                 $ret = null;
 339                 if( $matches[1] != '' ) {
 340                         $ret = Sanitizer::normalizeEntity( $matches[1] );
 341                 } elseif( $matches[2] != '' ) {
 342                         $ret = Sanitizer::decCharReference( $matches[2] );
 343                 } elseif( $matches[3] != ''  ) {
 344                         $ret = Sanitizer::hexCharReference( $matches[3] );
 345                 } elseif( $matches[4] != '' ) {
 346                         $ret = Sanitizer::hexCharReference( $matches[4] );
 347                 }
 348                 if( is_null( $ret ) ) {
 349                         return htmlspecialchars( $matches[0] );
 350                 } else {
 351                         return $ret;
 352                 }
 353         }
 354
 355         /**
 356          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 357          * return the named entity reference as is. Otherwise, returns
 358          * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
 359          *
 360          * @param string $name
 361          * @return string
 362          */
 363         function normalizeEntity( $name ) {
 364                 # List of all named character entities defined in HTML 4.01
 365                 # http://www.w3.org/TR/html4/sgml/entities.html
 366                 static $htmlEntities = array(
 367                         'aacute' => true,
 368                         'Aacute' => true,
 369                         'acirc' => true,
 370                         'Acirc' => true,
 371                         'acute' => true,
 372                         'aelig' => true,
 373                         'AElig' => true,
 374                         'agrave' => true,
 375                         'Agrave' => true,
 376                         'alefsym' => true,
 377                         'alpha' => true,
 378                         'Alpha' => true,
 379                         'amp' => true,
 380                         'and' => true,
 381                         'ang' => true,
 382                         'apos' => true,
 383                         'aring' => true,
 384                         'Aring' => true,
 385                         'asymp' => true,
 386                         'atilde' => true,
 387                         'Atilde' => true,
 388                         'auml' => true,
 389                         'Auml' => true,
 390                         'bdquo' => true,
 391                         'beta' => true,
 392                         'Beta' => true,
 393                         'brvbar' => true,
 394                         'bull' => true,
 395                         'cap' => true,
 396                         'ccedil' => true,
 397                         'Ccedil' => true,
 398                         'cedil' => true,
 399                         'cent' => true,
 400                         'chi' => true,
 401                         'Chi' => true,
 402                         'circ' => true,
 403                         'clubs' => true,
 404                         'cong' => true,
 405                         'copy' => true,
 406                         'crarr' => true,
 407                         'cup' => true,
 408                         'curren' => true,
 409                         'dagger' => true,
 410                         'Dagger' => true,
 411                         'darr' => true,
 412                         'dArr' => true,
 413                         'deg' => true,
 414                         'delta' => true,
 415                         'Delta' => true,
 416                         'diams' => true,
 417                         'divide' => true,
 418                         'eacute' => true,
 419                         'Eacute' => true,
 420                         'ecirc' => true,
 421                         'Ecirc' => true,
 422                         'egrave' => true,
 423                         'Egrave' => true,
 424                         'empty' => true,
 425                         'emsp' => true,
 426                         'ensp' => true,
 427                         'epsilon' => true,
 428                         'Epsilon' => true,
 429                         'equiv' => true,
 430                         'eta' => true,
 431                         'Eta' => true,
 432                         'eth' => true,
 433                         'ETH' => true,
 434                         'euml' => true,
 435                         'Euml' => true,
 436                         'euro' => true,
 437                         'exist' => true,
 438                         'fnof' => true,
 439                         'forall' => true,
 440                         'frac12' => true,
 441                         'frac14' => true,
 442                         'frac34' => true,
 443                         'frasl' => true,
 444                         'gamma' => true,
 445                         'Gamma' => true,
 446                         'ge' => true,
 447                         'gt' => true,
 448                         'harr' => true,
 449                         'hArr' => true,
 450                         'hearts' => true,
 451                         'hellip' => true,
 452                         'iacute' => true,
 453                         'Iacute' => true,
 454                         'icirc' => true,
 455                         'Icirc' => true,
 456                         'iexcl' => true,
 457                         'igrave' => true,
 458                         'Igrave' => true,
 459                         'image' => true,
 460                         'infin' => true,
 461                         'int' => true,
 462                         'iota' => true,
 463                         'Iota' => true,
 464                         'iquest' => true,
 465                         'isin' => true,
 466                         'iuml' => true,
 467                         'Iuml' => true,
 468                         'kappa' => true,
 469                         'Kappa' => true,
 470                         'lambda' => true,
 471                         'Lambda' => true,
 472                         'lang' => true,
 473                         'laquo' => true,
 474                         'larr' => true,
 475                         'lArr' => true,
 476                         'lceil' => true,
 477                         'ldquo' => true,
 478                         'le' => true,
 479                         'lfloor' => true,
 480                         'lowast' => true,
 481                         'loz' => true,
 482                         'lrm' => true,
 483                         'lsaquo' => true,
 484                         'lsquo' => true,
 485                         'lt' => true,
 486                         'macr' => true,
 487                         'mdash' => true,
 488                         'micro' => true,
 489                         'middot' => true,
 490                         'minus' => true,
 491                         'mu' => true,
 492                         'Mu' => true,
 493                         'nabla' => true,
 494                         'nbsp' => true,
 495                         'ndash' => true,
 496                         'ne' => true,
 497                         'ni' => true,
 498                         'not' => true,
 499                         'notin' => true,
 500                         'nsub' => true,
 501                         'ntilde' => true,
 502                         'Ntilde' => true,
 503                         'nu' => true,
 504                         'Nu' => true,
 505                         'oacute' => true,
 506                         'Oacute' => true,
 507                         'ocirc' => true,
 508                         'Ocirc' => true,
 509                         'oelig' => true,
 510                         'OElig' => true,
 511                         'ograve' => true,
 512                         'Ograve' => true,
 513                         'oline' => true,
 514                         'omega' => true,
 515                         'Omega' => true,
 516                         'omicron' => true,
 517                         'Omicron' => true,
 518                         'oplus' => true,
 519                         'or' => true,
 520                         'ordf' => true,
 521                         'ordm' => true,
 522                         'oslash' => true,
 523                         'Oslash' => true,
 524                         'otilde' => true,
 525                         'Otilde' => true,
 526                         'otimes' => true,
 527                         'ouml' => true,
 528                         'Ouml' => true,
 529                         'para' => true,
 530                         'part' => true,
 531                         'permil' => true,
 532                         'perp' => true,
 533                         'phi' => true,
 534                         'Phi' => true,
 535                         'pi' => true,
 536                         'Pi' => true,
 537                         'piv' => true,
 538                         'plusmn' => true,
 539                         'pound' => true,
 540                         'prime' => true,
 541                         'Prime' => true,
 542                         'prod' => true,
 543                         'prop' => true,
 544                         'psi' => true,
 545                         'Psi' => true,
 546                         'quot' => true,
 547                         'radic' => true,
 548                         'rang' => true,
 549                         'raquo' => true,
 550                         'rarr' => true,
 551                         'rArr' => true,
 552                         'rceil' => true,
 553                         'rdquo' => true,
 554                         'real' => true,
 555                         'reg' => true,
 556                         'rfloor' => true,
 557                         'rho' => true,
 558                         'Rho' => true,
 559                         'rlm' => true,
 560                         'rsaquo' => true,
 561                         'rsquo' => true,
 562                         'sbquo' => true,
 563                         'scaron' => true,
 564                         'Scaron' => true,
 565                         'sdot' => true,
 566                         'sect' => true,
 567                         'shy' => true,
 568                         'sigma' => true,
 569                         'Sigma' => true,
 570                         'sigmaf' => true,
 571                         'sim' => true,
 572                         'spades' => true,
 573                         'sub' => true,
 574                         'sube' => true,
 575                         'sum' => true,
 576                         'sup' => true,
 577                         'sup1' => true,
 578                         'sup2' => true,
 579                         'sup3' => true,
 580                         'supe' => true,
 581                         'szlig' => true,
 582                         'tau' => true,
 583                         'Tau' => true,
 584                         'there4' => true,
 585                         'theta' => true,
 586                         'Theta' => true,
 587                         'thetasym' => true,
 588                         'thinsp' => true,
 589                         'thorn' => true,
 590                         'THORN' => true,
 591                         'tilde' => true,
 592                         'times' => true,
 593                         'trade' => true,
 594                         'uacute' => true,
 595                         'Uacute' => true,
 596                         'uarr' => true,
 597                         'uArr' => true,
 598                         'ucirc' => true,
 599                         'Ucirc' => true,
 600                         'ugrave' => true,
 601                         'Ugrave' => true,
 602                         'uml' => true,
 603                         'upsih' => true,
 604                         'upsilon' => true,
 605                         'Upsilon' => true,
 606                         'uuml' => true,
 607                         'Uuml' => true,
 608                         'weierp' => true,
 609                         'xi' => true,
 610                         'Xi' => true,
 611                         'yacute' => true,
 612                         'Yacute' => true,
 613                         'yen' => true,
 614                         'yuml' => true,
 615                         'Yuml' => true,
 616                         'zeta' => true,
 617                         'Zeta' => true,
 618                         'zwj' => true,
 619                         'zwnj' => true );
 620                 if( isset( $htmlEntities[$name] ) ) {
 621                         return "&$name;";
 622                 } else {
 623                         return "&amp;$name;";
 624                 }
 625         }
 626
 627         function decCharReference( $codepoint ) {
 628                 $point = IntVal( $codepoint );
 629                 if( Sanitizer::validateCodepoint( $point ) ) {
 630                         return sprintf( '&#%d;', $point );
 631                 } else {
 632                         return null;
 633                 }
 634         }
 635
 636         function hexCharReference( $codepoint ) {
 637                 $point = hexdec( $codepoint );
 638                 if( Sanitizer::validateCodepoint( $point ) ) {
 639                         return sprintf( '&#x%x;', $point );
 640                 } else {
 641                         return null;
 642                 }
 643         }
 644
 645         /**
 646          * Returns true if a given Unicode codepoint is a valid character in XML.
 647          * @param int $codepoint
 648          * @return bool
 649          */
 650         function validateCodepoint( $codepoint ) {
 651                 return ($codepoint ==    0x09)
 652                         || ($codepoint ==    0x0a)
 653                         || ($codepoint ==    0x0d)
 654                         || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
 655                         || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
 656                         || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
 657         }
 658
 659         /**
 660          * Fetch the whitelist of acceptable attributes for a given
 661          * element name.
 662          *
 663          * @param string $element
 664          * @return array
 665          */
 666         function attributeWhitelist( $element ) {
 667                 $list = Sanitizer::setupAttributeWhitelist();
 668                 return isset( $list[$element] )
 669                         ? $list[$element]
 670                         : array();
 671         }
 672
 673         /**
 674          * @return array
 675          */
 676         function setupAttributeWhitelist() {
 677                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
 678                 $block = array_merge( $common, array( 'align' ) );
 679                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
 680                 $tablecell = array( 'abbr',
 681                                     'axis',
 682                                     'headers',
 683                                     'scope',
 684                                     'rowspan',
 685                                     'colspan',
 686                                     'nowrap', # deprecated
 687                                     'width', # deprecated
 688                                     'height' # deprecated
 689                                     );
 690
 691                 # Numbers refer to sections in HTML 4.01 standard describing the element.
 692                 # See: http://www.w3.org/TR/html4/
 693                 $whitelist = array (
 694                         # 7.5.4
 695                         'div'        => $block,
 696                         'center'     => $common, # deprecated
 697                         'span'       => $block, # ??
 698
 699                         # 7.5.5
 700                         'h1'         => $block,
 701                         'h2'         => $block,
 702                         'h3'         => $block,
 703                         'h4'         => $block,
 704                         'h5'         => $block,
 705                         'h6'         => $block,
 706
 707                         # 7.5.6
 708                         # address
 709
 710                         # 8.2.4
 711                         # bdo
 712
 713                         # 9.2.1
 714                         'em'         => $common,
 715                         'strong'     => $common,
 716                         'cite'       => $common,
 717                         # dfn
 718                         'code'       => $common,
 719                         # samp
 720                         # kbd
 721                         'var'        => $common,
 722                         # abbr
 723                         # acronym
 724
 725                         # 9.2.2
 726                         'blockquote' => array_merge( $common, array( 'cite' ) ),
 727                         # q
 728
 729                         # 9.2.3
 730                         'sub'        => $common,
 731                         'sup'        => $common,
 732
 733                         # 9.3.1
 734                         'p'          => $block,
 735
 736                         # 9.3.2
 737                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
 738
 739                         # 9.3.4
 740                         'pre'        => array_merge( $common, array( 'width' ) ),
 741
 742                         # 9.4
 743                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 744                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 745
 746                         # 10.2
 747                         'ul'         => array_merge( $common, array( 'type' ) ),
 748                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
 749                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
 750
 751                         # 10.3
 752                         'dl'         => $common,
 753                         'dd'         => $common,
 754                         'dt'         => $common,
 755
 756                         # 11.2.1
 757                         'table'      => array_merge( $common,
 758                                                                 array( 'summary', 'width', 'border', 'frame',
 759                                                                                          'rules', 'cellspacing', 'cellpadding',
 760                                                                                          'align', 'bgcolor', 'frame', 'rules',
 761                                                                                          'border' ) ),
 762
 763                         # 11.2.2
 764                         'caption'    => array_merge( $common, array( 'align' ) ),
 765
 766                         # 11.2.3
 767                         'thead'      => array_merge( $common, $tablealign ),
 768                         'tfoot'      => array_merge( $common, $tablealign ),
 769                         'tbody'      => array_merge( $common, $tablealign ),
 770
 771                         # 11.2.4
 772                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
 773                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
 774
 775                         # 11.2.5
 776                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
 777
 778                         # 11.2.6
 779                         'td'         => array_merge( $common, $tablecell, $tablealign ),
 780                         'th'         => array_merge( $common, $tablecell, $tablealign ),
 781
 782                         # 15.2.1
 783                         'tt'         => $common,
 784                         'b'          => $common,
 785                         'i'          => $common,
 786                         'big'        => $common,
 787                         'small'      => $common,
 788                         'strike'     => $common,
 789                         's'          => $common,
 790                         'u'          => $common,
 791
 792                         # 15.2.2
 793                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
 794                         # basefont
 795
 796                         # 15.3
 797                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
 798
 799                         # XHTML Ruby annotation text module, simple ruby only.
 800                         # http://www.w3c.org/TR/ruby/
 801                         'ruby'       => $common,
 802                         # rbc
 803                         # rtc
 804                         'rb'         => $common,
 805                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
 806                         'rp'         => $common,
 807                         );
 808                 return $whitelist;
 809         }
 810
 811         /**
 812          * Take a fragment of (potentially invalid) HTML and return
 813          * a version with any tags removed, encoded suitably for literal
 814          * inclusion in an attribute value.
 815          *
 816          * @param string $text HTML fragment
 817          * @return string
 818          */
 819         function stripAllTags( $text ) {
 820                 # Actual <tags>
 821                 $text = preg_replace( '/<[^>]*>/', '', $text );
 822
 823                 # Normalize &entities and whitespace
 824                 $text = Sanitizer::normalizeAttributeValue( $text );
 825
 826                 # Will be placed into "double-quoted" attributes,
 827                 # make sure remaining bits are safe.
 828                 $text = str_replace(
 829                         array('<', '>', '"'),
 830                         array('&lt;', '&gt;', '&quot;'),
 831                         $text );
 832
 833                 return $text;
 834         }
 835
 836 }
 837
 838 ?>