includes/Sanitizer.php

   1 <?php
   2
   3 /**
   4  * (X)HTML sanitizer for MediaWiki
   5  *
   6  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   7  * http://www.mediawiki.org/
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License along
  20  * with this program; if not, write to the Free Software Foundation, Inc.,
  21  * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  22  * http://www.gnu.org/copyleft/gpl.html
  23  *
  24  * @package MediaWiki
  25  * @subpackage Parser
  26  */
  27
  28 class Sanitizer {
  /**
   * Cleans up HTML, removes dangerous tags and attributes, and
   * removes HTML comments.
   *
   * After comments are stripped, the text is split on '<'. Each piece
   * that parses as a whitelisted tag is re-emitted with its attributes
   * filtered through Sanitizer::fixTagAttributes(); every other piece is
   * output with its leading '<' and any '>' escaped. When $wgUserHtml is
   * false the whitelist is empty, so every tag ends up escaped.
   *
   * When $wgUseTidy is false, open tags are tracked on a stack so that
   * mismatched or illegally nested tags are escaped and tags left open
   * are closed at the end of the text. When $wgUseTidy is true no
   * nesting checks are performed here.
   *
   * @access private
   * @param string $text
   * @return string
   */
  function removeHTMLtags( $text ) {
      global $wgUseTidy, $wgUserHtml;
      # NOTE(review): the profiling label still carries the "Parser::"
      # prefix; it is left untouched in case profiling data is keyed on
      # it -- confirm before renaming.
      $fname = 'Parser::removeHTMLtags';
      wfProfileIn( $fname );

      if( $wgUserHtml ) {
          $htmlpairs = array( # Tags that must be closed
              'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
              'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
              'strike', 'strong', 'tt', 'var', 'div', 'center',
              'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
              'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
          );
          $htmlsingle = array(
              'br', 'hr', 'li', 'dt', 'dd'
          );
          $htmlnest = array( # Tags that can be nested--??
              'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
              'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
          );
          $tabletags = array( # Can only appear inside table
              'td', 'th', 'tr'
          );
      } else {
          $htmlpairs = array();
          $htmlsingle = array();
          $htmlnest = array();
          $tabletags = array();
      }

      # Table row/cell tags are handled like single tags: they are never
      # pushed on the tag stack and their closing tags are not matched.
      $htmlsingle = array_merge( $tabletags, $htmlsingle );
      $htmlelements = array_merge( $htmlsingle, $htmlpairs );

      # Splits one '<'-delimited piece into: optional slash, tag name,
      # attribute soup, closing bracket and the text following the tag.
      $tagPattern = '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/';

      # Remove HTML comments
      $text = Sanitizer::removeHTMLcomments( $text );

      $bits = explode( '<', $text );
      $text = array_shift( $bits );
      if( !$wgUseTidy ) {
          $tagstack = array();
          $tablestack = array();
          foreach ( $bits as $x ) {
              # Only destructure when the piece really looks like a tag;
              # anything else falls through to the escaping at the bottom.
              if ( preg_match( $tagPattern, $x, $regs ) ) {
                  list( , $slash, $t, $params, $brace, $rest ) = $regs;
                  $t = strtolower( $t );
                  if ( in_array( $t, $htmlelements ) ) {
                      $badtag = 0;
                      $newparams = '';
                      if ( $slash ) {
                          # Closing a tag: a paired tag must match the
                          # innermost open tag, otherwise it is rejected
                          # and the stack is left as it was.
                          if ( !in_array( $t, $htmlsingle ) ) {
                              if ( empty( $tagstack ) ) {
                                  $badtag = 1;
                              } else {
                                  $ot = array_pop( $tagstack );
                                  if ( $ot !== $t ) {
                                      array_push( $tagstack, $ot );
                                      $badtag = 1;
                                  }
                              }
                          }
                          if ( !$badtag && $t == 'table' ) {
                              # Leaving a table restores the stack that
                              # was active when the table was opened.
                              $tagstack = array_pop( $tablestack );
                          }
                      } else {
                          # Keep track for later
                          if ( in_array( $t, $tabletags ) &&
                              !in_array( 'table', $tagstack ) ) {
                              $badtag = 1;
                          } else if ( in_array( $t, $tagstack ) &&
                              !in_array( $t, $htmlnest ) ) {
                              $badtag = 1;
                          } else if ( !in_array( $t, $htmlsingle ) ) {
                              if ( $t == 'table' ) {
                                  # Each table gets a fresh tag stack; the
                                  # outer one is parked on $tablestack.
                                  array_push( $tablestack, $tagstack );
                                  $tagstack = array();
                              }
                              array_push( $tagstack, $t );
                          }
                          # Strip non-approved attributes from the tag
                          $newparams = Sanitizer::fixTagAttributes( $params, $t );
                      }
                      if ( !$badtag ) {
                          $rest = str_replace( '>', '&gt;', $rest );
                          $text .= "<$slash$t$newparams$brace$rest";
                          continue;
                      }
                  }
              }
              $text .= '&lt;' . str_replace( '>', '&gt;', $x );
          }
          # Close off any remaining tags
          while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
              $text .= "</$t>\n";
              if ( $t == 'table' ) {
                  $tagstack = array_pop( $tablestack );
              }
          }
      } else {
          # this might be possible using tidy itself
          foreach ( $bits as $x ) {
              if ( preg_match( $tagPattern, $x, $regs ) ) {
                  list( , $slash, $t, $params, $brace, $rest ) = $regs;
                  $t = strtolower( $t );
                  if ( in_array( $t, $htmlelements ) ) {
                      $newparams = Sanitizer::fixTagAttributes( $params, $t );
                      $rest = str_replace( '>', '&gt;', $rest );
                      $text .= "<$slash$t$newparams$brace$rest";
                      continue;
                  }
              }
              $text .= '&lt;' . str_replace( '>', '&gt;', $x );
          }
      }
      wfProfileOut( $fname );
      return $text;
  }
 147
 /**
  * Strip every complete HTML comment ('<!--' through '-->') from the text.
  *
  * When a comment sits alone on its line, i.e. only spaces separate it
  * from a newline on each side, the surrounding spaces and one of the
  * two newlines are removed as well so that no blank line is left.
  * Processing stops at the first comment that is never terminated;
  * that comment is left in the text.
  *
  * @access private
  * @param string $text
  * @return string
  */
 function removeHTMLcomments( $text ) {
     $fname='Parser::removeHTMLcomments';
     wfProfileIn( $fname );
     for ( ;; ) {
         $open = strpos( $text, '<!--' );
         if ( $open === false ) {
             break;
         }
         $close = strpos( $text, '-->', $open + 4 );
         if ( $close === false ) {
             # Unterminated comment; bail out
             break;
         }
         # Point just past the closing '-->'
         $close += 3;

         # Grow a window around the comment: start one character before
         # it, walk left over spaces, then walk right over spaces.
         $from = max( $open - 1, 0 );
         $span = $close - $from;
         while ( $from > 0 && substr( $text, $from, 1 ) === ' ' ) {
             $from--;
             $span++;
         }
         while ( substr( $text, $from + $span, 1 ) === ' ' ) {
             $span++;
         }

         $newlineBefore = ( substr( $text, $from, 1 ) === "\n" );
         $newlineAfter = ( substr( $text, $from + $span, 1 ) === "\n" );
         if ( $newlineBefore && $newlineAfter ) {
             # Drop comment, padding and both newlines; put one newline back.
             $text = substr_replace( $text, "\n", $from, $span + 1 );
         } else {
             # Remove just the comment.
             $text = substr_replace( $text, '', $open, $close - $open );
         }
     }
     wfProfileOut( $fname );
     return $text;
 }
 193
 /**
  * Turn the raw attribute portion of a tag into a normalized,
  * whitelisted attribute string.
  *
  * - Attribute names are lowercased
  * - Attributes not whitelisted for $element are dropped
  * - Values go through Sanitizer::normalizeAttributeValue()
  * - Every value is emitted in double quotes
  * - An attribute given without a value gets its own name as value
  * - Only the first occurrence of an attribute is kept
  * - A style attribute matching the unsafe-content pattern is dropped
  * - The result starts with a space when at least one attribute is kept
  *
  * @param string $text
  * @param string $element
  * @return string
  *
  * @todo Check for legal values where the DTD limits things.
  * @todo Check for unique id attribute :P
  */
 function fixTagAttributes( $text, $element ) {
     if( trim( $text ) == '' ) {
         return '';
     }

     # Unquoted attribute
     # Since we quote this later, this can be anything distinguishable
     # from the end of the attribute
     $attrib = '[A-Za-z0-9]';
     $space = '[\x09\x0a\x0d\x20]';
     $pairPattern =
         "/(?:^|$space)($attrib+)
           ($space*=$space*
             (?:
              # The attribute value: quoted or alone
               \"([^<\"]*)\"
              | '([^<']*)'
              |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
              |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                                  # colors are specified like this.
                                  # We'll be normalizing it.
             )
            )?(?=$space|\$)/sx";
     $found = preg_match_all( $pairPattern, $text, $pairs, PREG_SET_ORDER );
     if( !$found ) {
         return '';
     }

     $allowed = array_flip( Sanitizer::attributeWhitelist( $element ) );
     $kept = array();
     foreach( $pairs as $set ) {
         $attribute = strtolower( $set[1] );
         if( !isset( $allowed[$attribute] ) ) {
             continue;
         }

         # Trailing groups that did not take part in the match are absent
         # from $set, so the highest index present tells which value form
         # was used.
         if( !isset( $set[2] ) ) {
             # In XHTML, attributes must have a value.
             $value = $set[1];
         } elseif( isset( $set[6] ) ) {
             # Illegal #XXXXXX color with no quotes.
             $value = Sanitizer::normalizeAttributeValue( $set[6] );
         } elseif( isset( $set[5] ) ) {
             # No quotes.
             $value = Sanitizer::normalizeAttributeValue( $set[5] );
         } elseif( isset( $set[4] ) ) {
             # Single-quoted; the value may contain bare double quotes
             $normalized = Sanitizer::normalizeAttributeValue( $set[4] );
             $value = str_replace( '"', '&quot;', $normalized );
         } elseif( isset( $set[3] ) ) {
             # Double-quoted
             $value = Sanitizer::normalizeAttributeValue( $set[3] );
         } else {
             wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
         }

         # Strip javascript "expression" from stylesheets.
         # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
         if( $attribute == 'style' ) {
             $unsafe = preg_match(
                 '/(expression|tps*:\/\/|url\\s*\().*/is',
                 wfMungeToUtf8( $value ) );
             if( $unsafe ) {
                 # haxx0r
                 continue;
             }
         }

         # First occurrence wins; later duplicates are ignored.
         if( !isset( $kept[$attribute] ) ) {
             $kept[$attribute] = "$attribute=\"$value\"";
         }
     }

     return empty( $kept ) ? '' : ' ' . implode( ' ', $kept );
 }
 288
 /**
  * Prepare XML source text for use as an attribute value.
  *
  * Character references are first normalized through
  * Sanitizer::normalizeCharReferences(); then each CR/LF pair and each
  * single space, carriage return, line feed or tab is replaced by one
  * space.
  *
  * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
  * but note that the result is an XML source fragment meant to be
  * placed into output, not the attribute's decoded value.
  *
  * @param string $text
  * @return string
  * @access private
  */
 function normalizeAttributeValue( $text ) {
     $withCleanRefs = Sanitizer::normalizeCharReferences( $text );
     return preg_replace( '/\r\n|[\x20\x0d\x0a\x09]/', ' ', $withCleanRefs );
 }
 307
 /**
  * Make every '&' in the text part of an acceptable entity or
  * character reference, or escape it.
  *
  * Each named reference, decimal reference, hexadecimal reference
  * (written with either &#x or &#X) and each remaining bare '&' is
  * handed to Sanitizer::normalizeCharReferencesCallback(), whose return
  * value replaces it.
  *
  * @param string $text
  * @return string
  * @access private
  */
 function normalizeCharReferences( $text ) {
     $refPattern = '/&([A-Za-z0-9]+);
      |&\#([0-9]+);
      |&\#x([0-9A-Za-z]+);
      |&\#X([0-9A-Za-z]+);
      |(&)/x';
     $handler = array( 'Sanitizer', 'normalizeCharReferencesCallback' );
     return preg_replace_callback( $refPattern, $handler, $text );
 }
 /**
  * Replacement callback for Sanitizer::normalizeCharReferences().
  *
  * Index 1 of $matches holds a named entity, index 2 a decimal code,
  * indexes 3 and 4 a hexadecimal code. The first non-empty one is
  * passed to the matching normalizer. If none is filled in, or the
  * normalizer rejects the value by returning null, the whole matched
  * text is returned HTML-escaped.
  *
  * @param array $matches
  * @return string
  */
 function normalizeCharReferencesCallback( $matches ) {
     if( $matches[1] != '' ) {
         $normalized = Sanitizer::normalizeEntity( $matches[1] );
     } elseif( $matches[2] != '' ) {
         $normalized = Sanitizer::decCharReference( $matches[2] );
     } elseif( $matches[3] != '' ) {
         $normalized = Sanitizer::hexCharReference( $matches[3] );
     } elseif( $matches[4] != '' ) {
         $normalized = Sanitizer::hexCharReference( $matches[4] );
     } else {
         $normalized = null;
     }

     return is_null( $normalized )
         ? htmlspecialchars( $matches[0] )
         : $normalized;
 }
 353
 /**
  * Return "&name;" unchanged when $name is one of the known named
  * character entities listed below; otherwise return the reference
  * with its ampersand escaped ("&amp;name;"). Lookup is case-sensitive.
  *
  * @param string $name
  * @return string
  */
 function normalizeEntity( $name ) {
     # Lookup table keyed by entity name, built once per request.
     static $known = null;
     if( is_null( $known ) ) {
         # List of all named character entities defined in HTML 4.01
         # http://www.w3.org/TR/html4/sgml/entities.html
         $known = array_flip( array(
             'aacute', 'Aacute', 'acirc', 'Acirc', 'acute', 'aelig', 'AElig',
             'agrave', 'Agrave', 'alefsym', 'alpha', 'Alpha', 'amp', 'and',
             'ang', 'apos', 'aring', 'Aring', 'asymp', 'atilde', 'Atilde',
             'auml', 'Auml',
             'bdquo', 'beta', 'Beta', 'brvbar', 'bull',
             'cap', 'ccedil', 'Ccedil', 'cedil', 'cent', 'chi', 'Chi', 'circ',
             'clubs', 'cong', 'copy', 'crarr', 'cup', 'curren',
             'dagger', 'Dagger', 'darr', 'dArr', 'deg', 'delta', 'Delta',
             'diams', 'divide',
             'eacute', 'Eacute', 'ecirc', 'Ecirc', 'egrave', 'Egrave', 'empty',
             'emsp', 'ensp', 'epsilon', 'Epsilon', 'equiv', 'eta', 'Eta',
             'eth', 'ETH', 'euml', 'Euml', 'euro', 'exist',
             'fnof', 'forall', 'frac12', 'frac14', 'frac34', 'frasl',
             'gamma', 'Gamma', 'ge', 'gt',
             'harr', 'hArr', 'hearts', 'hellip',
             'iacute', 'Iacute', 'icirc', 'Icirc', 'iexcl', 'igrave', 'Igrave',
             'image', 'infin', 'int', 'iota', 'Iota', 'iquest', 'isin',
             'iuml', 'Iuml',
             'kappa', 'Kappa',
             'lambda', 'Lambda', 'lang', 'laquo', 'larr', 'lArr', 'lceil',
             'ldquo', 'le', 'lfloor', 'lowast', 'loz', 'lrm', 'lsaquo',
             'lsquo', 'lt',
             'macr', 'mdash', 'micro', 'middot', 'minus', 'mu', 'Mu',
             'nabla', 'nbsp', 'ndash', 'ne', 'ni', 'not', 'notin', 'nsub',
             'ntilde', 'Ntilde', 'nu', 'Nu',
             'oacute', 'Oacute', 'ocirc', 'Ocirc', 'oelig', 'OElig', 'ograve',
             'Ograve', 'oline', 'omega', 'Omega', 'omicron', 'Omicron',
             'oplus', 'or', 'ordf', 'ordm', 'oslash', 'Oslash', 'otilde',
             'Otilde', 'otimes', 'ouml', 'Ouml',
             'para', 'part', 'permil', 'perp', 'phi', 'Phi', 'pi', 'Pi', 'piv',
             'plusmn', 'pound', 'prime', 'Prime', 'prod', 'prop', 'psi', 'Psi',
             'quot',
             'radic', 'rang', 'raquo', 'rarr', 'rArr', 'rceil', 'rdquo',
             'real', 'reg', 'rfloor', 'rho', 'Rho', 'rlm', 'rsaquo', 'rsquo',
             'sbquo', 'scaron', 'Scaron', 'sdot', 'sect', 'shy', 'sigma',
             'Sigma', 'sigmaf', 'sim', 'spades', 'sub', 'sube', 'sum', 'sup',
             'sup1', 'sup2', 'sup3', 'supe', 'szlig',
             'tau', 'Tau', 'there4', 'theta', 'Theta', 'thetasym', 'thinsp',
             'thorn', 'THORN', 'tilde', 'times', 'trade',
             'uacute', 'Uacute', 'uarr', 'uArr', 'ucirc', 'Ucirc', 'ugrave',
             'Ugrave', 'uml', 'upsih', 'upsilon', 'Upsilon', 'uuml', 'Uuml',
             'weierp',
             'xi', 'Xi',
             'yacute', 'Yacute', 'yen', 'yuml', 'Yuml',
             'zeta', 'Zeta', 'zwj', 'zwnj'
         ) );
     }

     if( !isset( $known[$name] ) ) {
         return "&amp;$name;";
     }
     return "&$name;";
 }
 625
 /**
  * Build a decimal character reference for a codepoint given as a
  * string of decimal digits.
  *
  * @param string $codepoint
  * @return string|null the "&#N;" reference, or null when
  *   Sanitizer::validateCodepoint() rejects the codepoint
  */
 function decCharReference( $codepoint ) {
     $point = intval( $codepoint );
     if( !Sanitizer::validateCodepoint( $point ) ) {
         return null;
     }
     return sprintf( '&#%d;', $point );
 }
 634
 /**
  * Build a lowercase hexadecimal character reference for a codepoint
  * given as a string of hexadecimal digits.
  *
  * Input containing anything other than hexadecimal digits is rejected.
  * hexdec() silently skips such characters, so without this check a
  * malformed reference like "4g1" would be turned into the valid but
  * different reference "&#x41;".
  *
  * @param string $codepoint
  * @return string|null the "&#xN;" reference, or null when the input
  *   is not purely hexadecimal or Sanitizer::validateCodepoint()
  *   rejects the codepoint
  */
 function hexCharReference( $codepoint ) {
     # D modifier: '$' must not match before a trailing newline
     if( !preg_match( '/^[0-9A-Fa-f]+$/D', $codepoint ) ) {
         return null;
     }
     $point = hexdec( $codepoint );
     if( Sanitizer::validateCodepoint( $point ) ) {
         return sprintf( '&#x%x;', $point );
     } else {
         return null;
     }
 }
 643
 /**
  * Tell whether a Unicode codepoint lies in one of the ranges accepted
  * here as a valid XML character: tab, line feed, carriage return,
  * 0x20-0xD7FF, 0xE000-0xFFFD or 0x10000-0x10FFFF.
  *
  * @param int $codepoint
  * @return bool
  */
 function validateCodepoint( $codepoint ) {
     # The three permitted control characters
     if( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
         return true;
     }
     # Below the surrogate block
     if( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) {
         return true;
     }
     # Above the surrogate block, excluding 0xFFFE and 0xFFFF
     if( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
         return true;
     }
     # Supplementary planes
     if( $codepoint >= 0x10000 && $codepoint <= 0x10ffff ) {
         return true;
     }
     return false;
 }
 657
 658         /**
 659          * Fetch the whitelist of acceptable attributes for a given
 660          * element name.
 661          *
 662          * @param string $element
 663          * @return array
 664          */
 665         function attributeWhitelist( $element ) {
 666                 $list = Sanitizer::setupAttributeWhitelist();
 667                 return isset( $list[$element] )
 668                         ? $list[$element]
 669                         : array();
 670         }
 671
 672         /**
 673          * @return array
 674          */
 675         function setupAttributeWhitelist() {
 676                 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
 677                 $block = array_merge( $common, array( 'align' ) );
 678                 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
 679                 $tablecell = array( 'abbr',
 680                                     'axis',
 681                                     'headers',
 682                                     'scope',
 683                                     'rowspan',
 684                                     'colspan',
 685                                     'nowrap', # deprecated
 686                                     'width', # deprecated
 687                                     'height' # deprecated
 688                                     );
 689
 690                 # Numbers refer to sections in HTML 4.01 standard describing the element.
 691                 # See: http://www.w3.org/TR/html4/
 692                 $whitelist = array (
 693                         # 7.5.4
 694                         'div'        => $block,
 695                         'center'     => $common, # deprecated
 696                         'span'       => $block, # ??
 697
 698                         # 7.5.5
 699                         'h1'         => $block,
 700                         'h2'         => $block,
 701                         'h3'         => $block,
 702                         'h4'         => $block,
 703                         'h5'         => $block,
 704                         'h6'         => $block,
 705
 706                         # 7.5.6
 707                         # address
 708
 709                         # 8.2.4
 710                         # bdo
 711
 712                         # 9.2.1
 713                         'em'         => $common,
 714                         'strong'     => $common,
 715                         'cite'       => $common,
 716                         # dfn
 717                         'code'       => $common,
 718                         # samp
 719                         # kbd
 720                         'var'        => $common,
 721                         # abbr
 722                         # acronym
 723
 724                         # 9.2.2
 725                         'blockquote' => array_merge( $common, array( 'cite' ) ),
 726                         # q
 727
 728                         # 9.2.3
 729                         'sub'        => $common,
 730                         'sup'        => $common,
 731
 732                         # 9.3.1
 733                         'p'          => $block,
 734
 735                         # 9.3.2
 736                         'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),
 737
 738                         # 9.3.4
 739                         'pre'        => array_merge( $common, array( 'width' ) ),
 740
 741                         # 9.4
 742                         'ins'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 743                         'del'        => array_merge( $common, array( 'cite', 'datetime' ) ),
 744
 745                         # 10.2
 746                         'ul'         => array_merge( $common, array( 'type' ) ),
 747                         'ol'         => array_merge( $common, array( 'type', 'start' ) ),
 748                         'li'         => array_merge( $common, array( 'type', 'value' ) ),
 749
 750                         # 10.3
 751                         'dl'         => $common,
 752                         'dd'         => $common,
 753                         'dt'         => $common,
 754
 755                         # 11.2.1
 756                         'table'      => array_merge( $common,
 757                                                                 array( 'summary', 'width', 'border', 'frame',
 758                                                                                          'rules', 'cellspacing', 'cellpadding',
 759                                                                                          'align', 'bgcolor', 'frame', 'rules',
 760                                                                                          'border' ) ),
 761
 762                         # 11.2.2
 763                         'caption'    => array_merge( $common, array( 'align' ) ),
 764
 765                         # 11.2.3
 766                         'thead'      => array_merge( $common, $tablealign ),
 767                         'tfoot'      => array_merge( $common, $tablealign ),
 768                         'tbody'      => array_merge( $common, $tablealign ),
 769
 770                         # 11.2.4
 771                         'colgroup'   => array_merge( $common, array( 'span', 'width' ), $tablealign ),
 772                         'col'        => array_merge( $common, array( 'span', 'width' ), $tablealign ),
 773
 774                         # 11.2.5
 775                         'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),
 776
 777                         # 11.2.6
 778                         'td'         => array_merge( $common, $tablecell, $tablealign ),
 779                         'th'         => array_merge( $common, $tablecell, $tablealign ),
 780
 781                         # 15.2.1
 782                         'tt'         => $common,
 783                         'b'          => $common,
 784                         'i'          => $common,
 785                         'big'        => $common,
 786                         'small'      => $common,
 787                         'strike'     => $common,
 788                         's'          => $common,
 789                         'u'          => $common,
 790
 791                         # 15.2.2
 792                         'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),
 793                         # basefont
 794
 795                         # 15.3
 796                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
 797
 798                         # XHTML Ruby annotation text module, simple ruby only.
 799                         # http://www.w3c.org/TR/ruby/
 800                         'ruby'       => $common,
 801                         # rbc
 802                         # rtc
 803                         'rb'         => $common,
 804                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
 805                         'rp'         => $common,
 806                         );
 807                 return $whitelist;
 808         }
 809
 810         /**
 811          * Take a fragment of (potentially invalid) HTML and return
 812          * a version with any tags removed, encoded suitably for literal
 813          * inclusion in an attribute value.
 814          *
 815          * @param string $text HTML fragment
 816          * @return string
 817          */
 818         function stripAllTags( $text ) {
 819                 # Actual <tags>
 820                 $text = preg_replace( '/<[^>]*>/', '', $text );
 821
 822                 # Normalize &entities and whitespace
 823                 $text = Sanitizer::normalizeAttributeValue( $text );
 824
 825                 # Will be placed into "double-quoted" attributes,
 826                 # make sure remaining bits are safe.
 827                 $text = str_replace(
 828                         array('<', '>', '"'),
 829                         array('&lt;', '&gt;', '&quot;'),
 830                         $text );
 831
 832                 return $text;
 833         }
 834
 835 }
 836
 837 ?>