includes/Sanitizer.php

   1 <?php
   2 /**
   3  * XHTML sanitizer for MediaWiki
   4  *
   5  * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
   6  * http://www.mediawiki.org/
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with this program; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21  * http://www.gnu.org/copyleft/gpl.html
  22  *
  23  * @file
  24  * @ingroup Parser
  25  */
  26
  /**
   * Regular expression to match various types of character references in
   * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
   *
   * Capture groups (at most one of them is non-empty for any match):
   *   1 - name of a named entity,                e.g. "amp" in &amp;
   *   2 - digits of a decimal reference,         e.g. "38"  in &#38;
   *   3 - digits of a hex reference with "x",    e.g. "26"  in &#x26;
   *   4 - digits of a hex reference with "X",    e.g. "26"  in &#X26;
   *   5 - a lone ampersand that starts none of the forms above
   *
   * The hexadecimal groups only accept real hexadecimal digits. Something
   * like &#x4Z1; is therefore not a character reference at all: only its
   * leading ampersand is matched (group 5), instead of the whole sequence
   * being captured as a "hex" number containing non-hex characters.
   */
  define( 'MW_CHAR_REFS_REGEX',
      '/&([A-Za-z0-9\x80-\xff]+);
       |&\#([0-9]+);
       |&\#x([0-9A-Fa-f]+);
       |&\#X([0-9A-Fa-f]+);
       |(&)/x' );
  37
  /**
   * Regular expression to match HTML/XML attribute pairs within a tag.
   * Allows some... latitude.
   * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
   *
   * A match must start at the beginning of the subject or after a
   * whitespace character, and must be followed by whitespace or the end
   * of the subject.
   *
   * Capture groups:
   *   1 - attribute name (one or more characters from $attrib)
   *   2 - the optional "= value" part, including the equals sign and
   *       any whitespace around it
   *   3 - the value when written in double quotes (quotes excluded)
   *   4 - the value when written in single quotes (quotes excluded)
   *   5 - the value when written without quotes
   *   6 - the value when written as '#' followed by hex digits
   *
   * NOTE(review): the alternative for group 5 already accepts '#' and all
   * hex digits, so the alternative for group 6 looks unreachable. Confirm
   * before relying on group 6 ever being set.
   */
  # Character class for a single character of an attribute name.
  $attrib = '[A-Za-z0-9]';
  # Character class for whitespace: tab, line feed, carriage return, space.
  $space = '[\x09\x0a\x0d\x20]';
  # The pattern is a double-quoted string: $attrib and $space are
  # interpolated, and the 'x' modifier makes the layout whitespace and the
  # '#' comments inside the pattern insignificant.
  define( 'MW_ATTRIBS_REGEX',
        "/(?:^|$space)($attrib+)
          ($space*=$space*
                (?:
                 # The attribute value: quoted or alone
                  \"([^<\"]*)\"
                 | '([^<']*)'
                 |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
                 |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
                                                         # colors are specified like this.
                                                         # We'll be normalizing it.
                )
           )?(?=$space|\$)/sx" );
  58
  /**
   * Named character entities of HTML 4.01, mapped to their Unicode code
   * points (entity name without '&' and ';' => decimal code point).
   * http://www.w3.org/TR/html4/sgml/entities.html
   *
   * Entries are kept in case-insensitive alphabetical order and grouped
   * by initial letter; names are case-sensitive.
   * @private
   */
  global $wgHtmlEntities;
  $wgHtmlEntities = array(
      # A
      'Aacute' => 193, 'aacute' => 225, 'Acirc' => 194, 'acirc' => 226,
      'acute' => 180, 'AElig' => 198, 'aelig' => 230, 'Agrave' => 192,
      'agrave' => 224, 'alefsym' => 8501, 'Alpha' => 913, 'alpha' => 945,
      'amp' => 38, 'and' => 8743, 'ang' => 8736, 'Aring' => 197,
      'aring' => 229, 'asymp' => 8776, 'Atilde' => 195, 'atilde' => 227,
      'Auml' => 196, 'auml' => 228,
      # B
      'bdquo' => 8222, 'Beta' => 914, 'beta' => 946, 'brvbar' => 166,
      'bull' => 8226,
      # C
      'cap' => 8745, 'Ccedil' => 199, 'ccedil' => 231, 'cedil' => 184,
      'cent' => 162, 'Chi' => 935, 'chi' => 967, 'circ' => 710,
      'clubs' => 9827, 'cong' => 8773, 'copy' => 169, 'crarr' => 8629,
      'cup' => 8746, 'curren' => 164,
      # D
      'dagger' => 8224, 'Dagger' => 8225, 'darr' => 8595, 'dArr' => 8659,
      'deg' => 176, 'Delta' => 916, 'delta' => 948, 'diams' => 9830,
      'divide' => 247,
      # E
      'Eacute' => 201, 'eacute' => 233, 'Ecirc' => 202, 'ecirc' => 234,
      'Egrave' => 200, 'egrave' => 232, 'empty' => 8709, 'emsp' => 8195,
      'ensp' => 8194, 'Epsilon' => 917, 'epsilon' => 949, 'equiv' => 8801,
      'Eta' => 919, 'eta' => 951, 'ETH' => 208, 'eth' => 240,
      'Euml' => 203, 'euml' => 235, 'euro' => 8364, 'exist' => 8707,
      # F
      'fnof' => 402, 'forall' => 8704, 'frac12' => 189, 'frac14' => 188,
      'frac34' => 190, 'frasl' => 8260,
      # G
      'Gamma' => 915, 'gamma' => 947, 'ge' => 8805, 'gt' => 62,
      # H
      'harr' => 8596, 'hArr' => 8660, 'hearts' => 9829, 'hellip' => 8230,
      # I
      'Iacute' => 205, 'iacute' => 237, 'Icirc' => 206, 'icirc' => 238,
      'iexcl' => 161, 'Igrave' => 204, 'igrave' => 236, 'image' => 8465,
      'infin' => 8734, 'int' => 8747, 'Iota' => 921, 'iota' => 953,
      'iquest' => 191, 'isin' => 8712, 'Iuml' => 207, 'iuml' => 239,
      # K
      'Kappa' => 922, 'kappa' => 954,
      # L
      'Lambda' => 923, 'lambda' => 955, 'lang' => 9001, 'laquo' => 171,
      'larr' => 8592, 'lArr' => 8656, 'lceil' => 8968, 'ldquo' => 8220,
      'le' => 8804, 'lfloor' => 8970, 'lowast' => 8727, 'loz' => 9674,
      'lrm' => 8206, 'lsaquo' => 8249, 'lsquo' => 8216, 'lt' => 60,
      # M
      'macr' => 175, 'mdash' => 8212, 'micro' => 181, 'middot' => 183,
      'minus' => 8722, 'Mu' => 924, 'mu' => 956,
      # N
      'nabla' => 8711, 'nbsp' => 160, 'ndash' => 8211, 'ne' => 8800,
      'ni' => 8715, 'not' => 172, 'notin' => 8713, 'nsub' => 8836,
      'Ntilde' => 209, 'ntilde' => 241, 'Nu' => 925, 'nu' => 957,
      # O
      'Oacute' => 211, 'oacute' => 243, 'Ocirc' => 212, 'ocirc' => 244,
      'OElig' => 338, 'oelig' => 339, 'Ograve' => 210, 'ograve' => 242,
      'oline' => 8254, 'Omega' => 937, 'omega' => 969, 'Omicron' => 927,
      'omicron' => 959, 'oplus' => 8853, 'or' => 8744, 'ordf' => 170,
      'ordm' => 186, 'Oslash' => 216, 'oslash' => 248, 'Otilde' => 213,
      'otilde' => 245, 'otimes' => 8855, 'Ouml' => 214, 'ouml' => 246,
      # P
      'para' => 182, 'part' => 8706, 'permil' => 8240, 'perp' => 8869,
      'Phi' => 934, 'phi' => 966, 'Pi' => 928, 'pi' => 960,
      'piv' => 982, 'plusmn' => 177, 'pound' => 163, 'prime' => 8242,
      'Prime' => 8243, 'prod' => 8719, 'prop' => 8733, 'Psi' => 936,
      'psi' => 968,
      # Q
      'quot' => 34,
      # R
      'radic' => 8730, 'rang' => 9002, 'raquo' => 187, 'rarr' => 8594,
      'rArr' => 8658, 'rceil' => 8969, 'rdquo' => 8221, 'real' => 8476,
      'reg' => 174, 'rfloor' => 8971, 'Rho' => 929, 'rho' => 961,
      'rlm' => 8207, 'rsaquo' => 8250, 'rsquo' => 8217,
      # S
      'sbquo' => 8218, 'Scaron' => 352, 'scaron' => 353, 'sdot' => 8901,
      'sect' => 167, 'shy' => 173, 'Sigma' => 931, 'sigma' => 963,
      'sigmaf' => 962, 'sim' => 8764, 'spades' => 9824, 'sub' => 8834,
      'sube' => 8838, 'sum' => 8721, 'sup' => 8835, 'sup1' => 185,
      'sup2' => 178, 'sup3' => 179, 'supe' => 8839, 'szlig' => 223,
      # T
      'Tau' => 932, 'tau' => 964, 'there4' => 8756, 'Theta' => 920,
      'theta' => 952, 'thetasym' => 977, 'thinsp' => 8201, 'THORN' => 222,
      'thorn' => 254, 'tilde' => 732, 'times' => 215, 'trade' => 8482,
      # U
      'Uacute' => 218, 'uacute' => 250, 'uarr' => 8593, 'uArr' => 8657,
      'Ucirc' => 219, 'ucirc' => 251, 'Ugrave' => 217, 'ugrave' => 249,
      'uml' => 168, 'upsih' => 978, 'Upsilon' => 933, 'upsilon' => 965,
      'Uuml' => 220, 'uuml' => 252,
      # W
      'weierp' => 8472,
      # X
      'Xi' => 926, 'xi' => 958,
      # Y
      'Yacute' => 221, 'yacute' => 253, 'yen' => 165, 'Yuml' => 376,
      'yuml' => 255,
      # Z
      'Zeta' => 918, 'zeta' => 950, 'zwj' => 8205, 'zwnj' => 8204,
  );
 318
 /**
  * Alternative entity names accepted by MediaWiki, mapped to the
  * standard entity name they stand for (alias => canonical name).
  * Both aliases below are non-Latin spellings of 'rlm'.
  */
 global $wgHtmlEntityAliases;
 $wgHtmlEntityAliases = array();
 $wgHtmlEntityAliases['רלמ'] = 'rlm';
 $wgHtmlEntityAliases['رلم'] = 'rlm';
 327
 328
 329 /**
 330  * XHTML sanitizer for MediaWiki
 331  * @ingroup Parser
 332  */
 333 class Sanitizer {
 /**
  * Cleans up HTML, removes dangerous tags and attributes, and
  * removes HTML comments.
  *
  * The text is split on '<'. Every piece that looks like an allowed tag
  * is re-emitted with its attributes passed through fixTagAttributes();
  * every other piece has its '<' and '>' escaped. Unless $wgUseTidy is
  * set, a stack of open elements is additionally maintained so that
  * mismatched closing tags are escaped and elements still open at the
  * end are closed. With $wgUseTidy set, no balancing is done here.
  *
  * @private
  * @param string $text
  * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
  * @param array $args for the processing callback
  * @param array $extratags additional tag names to accept for this call;
  *              they are treated like the built-in tags that must be closed
  * @return string
  */
 static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array() ) {
     global $wgUseTidy;

     static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
         $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;

     wfProfileIn( __METHOD__ );

     if ( !$staticInitialised ) {
         $pairs = array( # Tags that must be closed
             'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
             'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
             'strike', 'strong', 'tt', 'var', 'div', 'center',
             'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
             'ruby', 'rt', 'rb', 'rp', 'p', 'span'
         );
         $single = array(
             'br', 'hr', 'li', 'dt', 'dd'
         );
         $singleonly = array( # Elements that cannot have close tags
             'br', 'hr'
         );
         $nest = array( # Tags that can be nested--??
             'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
             'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
         );
         $table = array( # Can only appear inside table, we will close them
             'td', 'th', 'tr',
         );
         $list = array( # Tags used by list
             'ul', 'ol',
         );
         $listitems = array( # Tags that can appear in a list
             'li',
         );

         # Keep everything as hashtables (name => index) for faster lookup
         $htmlpairsStatic = array_flip( $pairs );
         $htmlsingle = array_flip( $single );
         $htmlsingleonly = array_flip( $singleonly );
         $htmlnest = array_flip( $nest );
         $tabletags = array_flip( $table );
         $htmllist = array_flip( $list );
         $listtags = array_flip( $listitems );
         $htmlsingleallowed = array_flip( array_merge( $single, $table ) );
         $htmlelementsStatic = array_flip( array_merge( $single, $pairs, $nest ) );
         $staticInitialised = true;
     }

     # The extra tags are applied on every call. They must not be baked
     # into the static tables, otherwise only the $extratags of the very
     # first call in a request would ever be honoured.
     $extratags = array_flip( $extratags );
     $htmlpairs = $htmlpairsStatic + $extratags;
     $htmlelements = $htmlelementsStatic + $extratags;

     # Remove HTML comments
     $text = Sanitizer::removeHTMLcomments( $text );
     $bits = explode( '<', $text );
     # Everything before the first '<' is plain text
     $text = str_replace( '>', '&gt;', array_shift( $bits ) );
     if ( !$wgUseTidy ) {
         # $tagstack holds the open elements of the current table level;
         # $tablestack holds the saved stacks of the enclosing levels.
         $tagstack = $tablestack = array();
         foreach ( $bits as $x ) {
             $regs = array();
             if ( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
                 list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
             } else {
                 $slash = $t = $params = $brace = $rest = null;
             }

             $badtag = 0;
             if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
                 # Check our stack
                 if ( $slash ) {
                     # Closing a tag...
                     if ( isset( $htmlsingleonly[$t] ) ) {
                         $badtag = 1;
                     } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
                         if ( isset( $htmlsingleallowed[$ot] ) ) {
                             # Pop all elements with an optional close tag
                             # and see if we find a match below them
                             $optstack = array();
                             array_push( $optstack, $ot );
                             while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
                                 isset( $htmlsingleallowed[$ot] ) )
                             {
                                 array_push( $optstack, $ot );
                             }
                             if ( $t != $ot ) {
                                 # No match. Push the optional elements back again
                                 $badtag = 1;
                                 # The element that ended the search was popped
                                 # as well; put it back first so that it is not
                                 # lost (it may be the enclosing 'table').
                                 if ( $ot !== null ) {
                                     array_push( $tagstack, $ot );
                                 }
                                 while ( $ot = @array_pop( $optstack ) ) {
                                     array_push( $tagstack, $ot );
                                 }
                             } elseif ( $t == 'table' ) {
                                 # The table was closed with cells/rows still
                                 # open: return to the enclosing stack, exactly
                                 # as for a directly matching </table>.
                                 $tagstack = array_pop( $tablestack );
                             }
                         } else {
                             # Undo the pop; nothing to undo on an empty stack
                             if ( $ot !== null ) {
                                 @array_push( $tagstack, $ot );
                             }
                             # <li> can be nested in <ul> or <ol>, skip those cases:
                             if ( !( isset( $htmllist[$ot] ) && isset( $listtags[$t] ) ) ) {
                                 $badtag = 1;
                             }
                         }
                     } else {
                         if ( $t == 'table' ) {
                             $tagstack = array_pop( $tablestack );
                         }
                     }
                     $newparams = '';
                 } else {
                     # Keep track for later
                     if ( isset( $tabletags[$t] ) &&
                         !in_array( 'table', $tagstack ) ) {
                         $badtag = 1;
                     } elseif ( in_array( $t, $tagstack ) &&
                         !isset( $htmlnest[$t] ) ) {
                         $badtag = 1;
                     # Is it a self closed htmlpair ? (bug 5487)
                     } elseif ( $brace == '/>' &&
                         isset( $htmlpairs[$t] ) ) {
                         $badtag = 1;
                     } elseif ( isset( $htmlsingleonly[$t] ) ) {
                         # Hack to force empty tag for uncloseable elements
                         $brace = '/>';
                     } elseif ( isset( $htmlsingle[$t] ) ) {
                         # Hack to not close $htmlsingle tags
                         $brace = null;
                     } elseif ( isset( $tabletags[$t] )
                         && in_array( $t, $tagstack ) ) {
                         // New table tag but forgot to close the previous one
                         $text .= "</$t>";
                     } else {
                         if ( $t == 'table' ) {
                             # A table starts a fresh stack; the current
                             # one is saved until the table is closed.
                             array_push( $tablestack, $tagstack );
                             $tagstack = array();
                         }
                         array_push( $tagstack, $t );
                     }

                     # Replace any variables or template parameters with
                     # plaintext results.
                     if ( is_callable( $processCallback ) ) {
                         call_user_func_array( $processCallback, array( &$params, $args ) );
                     }

                     # Strip non-approved attributes from the tag
                     $newparams = Sanitizer::fixTagAttributes( $params, $t );
                 }
                 if ( !$badtag ) {
                     $rest = str_replace( '>', '&gt;', $rest );
                     $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
                     $text .= "<$slash$t$newparams$close>$rest";
                     continue;
                 }
             }
             # Not an acceptable tag: show it as text
             $text .= '&lt;' . str_replace( '>', '&gt;', $x );
         }
         # Close off any remaining tags
         while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
             $text .= "</$t>\n";
             if ( $t == 'table' ) {
                 $tagstack = array_pop( $tablestack );
             }
         }
     } else {
         # this might be possible using tidy itself
         foreach ( $bits as $x ) {
             preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
                 $x, $regs );
             # $regs is empty when the piece is not a tag; the variables
             # are then null and the piece is escaped below.
             @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
             if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
                 if ( is_callable( $processCallback ) ) {
                     call_user_func_array( $processCallback, array( &$params, $args ) );
                 }
                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
                 $rest = str_replace( '>', '&gt;', $rest );
                 $text .= "<$slash$t$newparams$brace$rest";
             } else {
                 $text .= '&lt;' . str_replace( '>', '&gt;', $x );
             }
         }
     }
     wfProfileOut( __METHOD__ );
     return $text;
 }
 518
 /**
  * Strip every complete HTML comment ('<!--' up to and including the
  * next '-->') from the text.
  *
  * When a comment sits alone on its line, i.e. only spaces separate it
  * from a newline on both sides, the surrounding spaces and one of the
  * two newlines are removed together with it, so that no blank line is
  * left behind. Processing stops at the first comment that is never
  * terminated; that comment and everything after it is left as is.
  *
  * @private
  * @param string $text
  * @return string
  */
 static function removeHTMLcomments( $text ) {
     wfProfileIn( __METHOD__ );
     for ( ;; ) {
         $open = strpos( $text, '<!--' );
         if ( $open === false ) {
             break;
         }
         $close = strpos( $text, '-->', $open + 4 );
         if ( $close === false ) {
             # Unterminated comment; nothing more can be removed
             break;
         }
         # First position after the comment's '-->'
         $after = $close + 3;

         # Walk left from the character before the comment over spaces
         $left = max( $open - 1, 0 );
         while ( $left > 0 && substr( $text, $left, 1 ) === ' ' ) {
             $left--;
         }
         # Walk right from the character after the comment over spaces
         $right = $after;
         while ( substr( $text, $right, 1 ) === ' ' ) {
             $right++;
         }

         $newlineBefore = ( substr( $text, $left, 1 ) === "\n" );
         $newlineAfter = ( substr( $text, $right, 1 ) === "\n" );
         if ( $newlineBefore && $newlineAfter ) {
             # Comment on a line of its own: collapse both newlines,
             # the spaces and the comment into a single newline.
             $text = substr_replace( $text, "\n", $left, $right - $left + 1 );
         } else {
             # Cut out the comment only.
             $text = substr_replace( $text, '', $open, $after - $open );
         }
     }
     wfProfileOut( __METHOD__ );
     return $text;
 }
 563
 /**
  * Validate a set of attributes against the whitelist of one element.
  *
  * Looks up the attribute whitelist for $element and hands both over to
  * validateAttributes(), which does the actual filtering:
  *
  * - Discards attributes not on a whitelist for the given element
  * - Unsafe style attributes are discarded
  * - Invalid id attributes are reencoded
  *
  * @param array $attribs attribute name => value
  * @param string $element
  * @return array
  *
  * @todo Check for legal values where the DTD limits things.
  * @todo Check for unique id attribute :P
  */
 static function validateTagAttributes( $attribs, $element ) {
     $allowed = Sanitizer::attributeWhitelist( $element );
     return Sanitizer::validateAttributes( $attribs, $allowed );
 }
 583
 /**
  * Filter a set of attributes against a list of allowed names and
  * normalize the values of the ones that are kept.
  *
  * - Attributes whose name is not in the whitelist are dropped
  * - A 'style' value is run through checkCss(); the attribute is
  *   dropped when checkCss() rejects it
  * - An 'id' value is run through escapeId()
  *
  * @param array $attribs attribute name => value
  * @param array $whitelist list of allowed attribute names
  * @return array the accepted attributes, name => cleaned value
  *
  * @todo Check for legal values where the DTD limits things.
  * @todo Check for unique id attribute :P
  */
 static function validateAttributes( $attribs, $whitelist ) {
     # Names as keys, for constant-time lookup
     $allowed = array_flip( $whitelist );
     $clean = array();
     foreach ( $attribs as $name => $val ) {
         if ( isset( $allowed[$name] ) ) {
             # Strip javascript "expression" from stylesheets.
             # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
             if ( $name == 'style' ) {
                 $val = Sanitizer::checkCss( $val );
                 if ( $val === false ) {
                     # Rejected as unsafe: drop the whole attribute
                     continue;
                 }
             }

             if ( $name === 'id' ) {
                 $val = Sanitizer::escapeId( $val );
             }

             # Keyed by name, so the result holds each attribute once.
             $clean[$name] = $val;
         }
     }
     return $clean;
 }
 625
 /**
  * Combine two attribute arrays into one.
  *
  * Entries of $b replace entries of $a that have the same name. The one
  * exception is 'class': when both arrays carry a string 'class' and the
  * two strings differ, the class names of both are joined into a single
  * space-separated list without duplicates ($a's names first).
  *
  * @todo implement merging for other attributes such as style
  * @param array $a
  * @param array $b
  * @return array
  */
 static function mergeAttributes( $a, $b ) {
     $merged = array_merge( $a, $b );

     # Nothing special to do unless both sides bring a class string
     if ( !isset( $a['class'] ) || !isset( $b['class'] ) ) {
         return $merged;
     }
     if ( !is_string( $a['class'] ) || !is_string( $b['class'] ) ) {
         return $merged;
     }
     if ( $a['class'] === $b['class'] ) {
         return $merged;
     }

     $combined = "{$a['class']} {$b['class']}";
     $names = preg_split( '/\s+/', $combined, -1, PREG_SPLIT_NO_EMPTY );
     $merged['class'] = implode( ' ', array_unique( $names ) );
     return $merged;
 }
 647
 /**
  * Pick apart some CSS and check it for forbidden or unsafe structures.
  * Returns a sanitized string, or false if it was just too evil.
  *
  * Currently URL references, 'expression', 'tps' are forbidden.
  *
  * The check is done on a copy in which character references, CSS
  * comments, CSS hexadecimal escapes and backslashes have been resolved
  * or removed. The string that is returned only has the character
  * references decoded and the comments replaced by a space.
  *
  * @param string $value
  * @return mixed the cleaned string, or false if the CSS is rejected
  */
 static function checkCss( $value ) {
     $stripped = Sanitizer::decodeCharReferences( $value );

     // Remove any comments; IE gets token splitting wrong
     $stripped = StringUtils::delimiterReplace( '/*', '*/', ' ', $stripped );

     // This is the form handed back to the caller
     $value = $stripped;

     // ... and continue checks on a further decoded copy.
     // Resolve CSS escapes (backslash, 1-6 hex digits, optional single
     // whitespace). A callback is used instead of the /e modifier, which
     // evaluates code and is no longer available in current PHP.
     $stripped = preg_replace_callback(
         '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
         array( 'Sanitizer', 'cssDecodeHexEscapeCallback' ),
         $stripped );
     $stripped = str_replace( '\\', '', $stripped );
     if ( preg_match( '/(?:expression|tps*:\/\/|url\\s*\().*/is',
             $stripped ) ) {
         # haxx0r
         return false;
     }

     return $value;
 }

 /**
  * Callback for checkCss(): turn one matched CSS hexadecimal escape
  * into the character it stands for.
  *
  * @private
  * @param array $matches regex matches; element 1 holds the hex digits
  * @return string
  */
 static function cssDecodeHexEscapeCallback( $matches ) {
     return codepointToUtf8( hexdec( $matches[1] ) );
 }
 677
 /**
  * Turn the raw attribute part of a tag (tag soup) into a well-formed
  * attribute string for the given element.
  *
  * The text is decoded with decodeTagAttributes(), filtered with
  * validateTagAttributes() and written out again as name="value" pairs.
  * As documented for this method, the result is safe for further
  * wikitext processing and the overall effect is:
  *
  * - Normalizes attribute names to lowercase
  * - Discards attributes not on a whitelist for the given element
  * - Turns broken or invalid entities into plaintext
  * - Double-quotes all attribute values
  * - Attributes without values are given the name as value
  * - Double attributes are discarded
  * - Unsafe style attributes are discarded
  * - Prepends space if there are attributes.
  *
  * @param string $text
  * @param string $element
  * @return string empty, or the attributes each preceded by one space
  */
 static function fixTagAttributes( $text, $element ) {
     if ( trim( $text ) == '' ) {
         return '';
     }

     $decoded = Sanitizer::decodeTagAttributes( $text );
     $stripped = Sanitizer::validateTagAttributes( $decoded, $element );

     # Every pair gets its own leading space, which also yields the
     # space that separates the first attribute from the tag name.
     $out = '';
     foreach ( $stripped as $name => $val ) {
         $encName = htmlspecialchars( $name );
         $encVal = Sanitizer::safeEncodeAttribute( $val );
         $out .= " $encName=\"$encVal\"";
     }
     return $out;
 }
 714
 /**
  * Encode an attribute value for HTML output.
  *
  * The text is escaped with htmlspecialchars() in ENT_QUOTES mode, then
  * literal newlines, carriage returns and tabs are written as numeric
  * character references.
  *
  * @param string $text
  * @return string HTML-encoded text fragment
  */
 static function encodeAttribute( $text ) {
         // Whitespace is normalized during attribute decoding,
         // so if we've been passed non-spaces we must encode them
         // ahead of time or they won't be preserved.
         $whitespace = array( "\n", "\r", "\t" );
         $references = array( '&#10;', '&#13;', '&#9;' );

         return str_replace(
                 $whitespace,
                 $references,
                 htmlspecialchars( $text, ENT_QUOTES ) );
 }
 734
 /**
  * Encode an attribute value for HTML tags, with extra armoring
  * against further wiki processing.
  *
  * On top of encodeAttribute(), characters and words that later wikitext
  * parsing could act on are rewritten as character references, and every
  * colon inside a match of the wfUrlProtocols() pattern becomes &#58;.
  *
  * @param string $text
  * @return string HTML-encoded text fragment
  */
 static function safeEncodeAttribute( $text ) {
         # Templates and links may be expanded in later parsing,
         # creating invalid or dangerous output. Suppress this.
         $armor = array(
                 # The first three should never happen: seeing them means
                 # we've received invalid input which should have been escaped.
                 '<'    => '&lt;',
                 '>'    => '&gt;',
                 '"'    => '&quot;',
                 '{'    => '&#123;',
                 '['    => '&#91;',
                 "''"   => '&#39;&#39;',
                 'ISBN' => '&#73;SBN',
                 'RFC'  => '&#82;FC',
                 'PMID' => '&#80;MID',
                 '|'    => '&#124;',
                 '__'   => '&#95;_',
         );
         $armored = strtr( Sanitizer::encodeAttribute( $text ), $armor );

         # Stupid hack
         $protocols = '/(' . wfUrlProtocols() . ')/';
         return preg_replace_callback(
                 $protocols,
                 array( 'Sanitizer', 'armorLinksCallback' ),
                 $armored );
 }
 767
 /**
  * Given a value escape it so that it can be used in an id attribute and
  * return it, this does not validate the value however (see first link)
  *
  * Spaces become underscores, character references are decoded and the
  * result is passed through urlencode(); '%3A' is then turned back into
  * ':' and every remaining '%' into '.'.
  *
  * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
  *                                                          in the id and
  *                                                          name attributes
  * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
  *
  * @param string $id      Id to escape
  * @param mixed  $options String or array of strings (default is array()):
  *   'noninitial': This is a non-initial fragment of an id, not a full id,
  *       so don't prepend an 'x' if the first character isn't valid at the
  *       beginning of an id.
  * @return string Escaped id; an empty input yields an empty string
  */
 static function escapeId( $id, $options = array() ) {
         $options = (array)$options;
         static $replace = array(
                 '%3A' => ':',
                 '%' => '.'
         );

         $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
         $id = str_replace( array_keys( $replace ), array_values( $replace ), $id );

         // An empty id has no first character: reading $id[0] would raise
         // an "uninitialized string offset" notice, so test for it first.
         // The empty string is returned unchanged, as before.
         if( $id !== ''
         && preg_match( '/[^a-zA-Z]/', $id[0] )
         && !in_array( 'noninitial', $options ) )  {
                 // Initial character must be a letter!
                 $id = "x$id";
         }
         return $id;
 }
 801
 /**
  * Given a value, escape it so that it can be used as a CSS class and
  * return it.
  *
  * A leading digit or hyphen, the bytes 0x00-0x20, the listed ASCII
  * punctuation and the byte pair C2 A0 are each replaced by an underscore;
  * runs of underscores are then collapsed to one and trailing underscores
  * are removed.
  *
  * @todo For extra validity, input should be validated UTF-8.
  *
  * @see http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
  *
  * @param string $class
  * @return string
  */
 static function escapeClass( $class ) {
         // Convert ugly stuff to underscores...
         $ugly = '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/';
         $escaped = preg_replace( $ugly, '_', $class );

         // ... and kill underscores in ugly places
         $escaped = preg_replace( '/_+/', '_', $escaped );
         return rtrim( $escaped, '_' );
 }
 820
 /**
  * Given HTML input, escape with htmlspecialchars but un-escape entities.
  * This allows (generally harmless) entities like &nbsp; to survive.
  *
  * Every '&amp;' produced by the escaping is turned back into '&', and the
  * result is handed to normalizeCharReferences().
  *
  * @param  string $html String to escape
  * @return string Escaped input
  */
 static function escapeHtmlAllowEntities( $html ) {
         # It seems wise to escape ' as well as ", as a matter of course.  Can't
         # hurt.
         $escaped = htmlspecialchars( $html, ENT_QUOTES );

         # Bring the ampersands back so that entity syntax is visible again
         # to the normalizer.
         $withAmpersands = str_replace( '&amp;', '&', $escaped );

         return Sanitizer::normalizeCharReferences( $withAmpersands );
 }
 836
 /**
  * Regex replace callback for armoring links against further processing.
  * Every colon in the first captured group is replaced by &#58;.
  *
  * @param array $matches Match set; only index 1 is used
  * @return string
  * @private
  */
 private static function armorLinksCallback( $matches ) {
         $colonArmor = array( ':' => '&#58;' );
         return strtr( $matches[1], $colonArmor );
 }
 846
 /**
  * Return an associative array of attribute names and values from
  * a partial tag string. Attribute names are forced to lowercase,
  * character references are decoded to UTF-8 text.
  *
  * Runs of tabs, CRs, LFs and spaces inside a value are collapsed to one
  * space and the value is trimmed before decoding. When a name occurs more
  * than once, the last occurrence is the one kept.
  *
  * @param string $text
  * @return array Lowercased attribute name => decoded value; empty when
  *   the text is blank or nothing matches MW_ATTRIBS_REGEX
  */
 public static function decodeTagAttributes( $text ) {
         if( trim( $text ) == '' ) {
                 return array();
         }

         $matchSets = array();
         $found = preg_match_all(
                 MW_ATTRIBS_REGEX,
                 $text,
                 $matchSets,
                 PREG_SET_ORDER );
         if( !$found ) {
                 return array();
         }

         $decoded = array();
         foreach( $matchSets as $matchSet ) {
                 $name = strtolower( $matchSet[1] );

                 // Normalize whitespace
                 $raw = Sanitizer::getTagAttributeCallback( $matchSet );
                 $raw = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

                 // Decode character references
                 $decoded[$name] = Sanitizer::decodeCharReferences( $raw );
         }
         return $decoded;
 }
 884
 /**
  * Pick the appropriate attribute value from a match set from the
  * MW_ATTRIBS_REGEX matches.
  *
  * The highest-numbered value group that is set wins: 6 (illegal #XXXXXX
  * color with no quotes), 5 (no quotes), 4 (single-quoted), 3
  * (double-quoted). When no value part was matched at all, the attribute
  * name itself is returned.
  *
  * @param array $set
  * @return string
  * @throws MWException If group 2 is set but none of groups 3-6 is
  * @private
  */
 private static function getTagAttributeCallback( $set ) {
         for( $group = 6; $group >= 3; $group-- ) {
                 if( isset( $set[$group] ) ) {
                         return $set[$group];
                 }
         }

         if( isset( $set[2] ) ) {
                 throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
         }

         # In XHTML, attributes must have a value.
         # For 'reduced' form, return explicitly the attribute name here.
         return $set[1];
 }
 914
 /**
  * Normalize whitespace and character references in an XML source-
  * encoded text for an attribute value.
  *
  * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
  * but note that we're not returning the value, but are returning
  * XML source fragments that will be slapped into output.
  *
  * Character references are normalized first, then whitespace, and
  * finally every double quote is replaced by &quot;.
  *
  * @param string $text
  * @return string
  * @private
  */
 private static function normalizeAttributeValue( $text ) {
         $normalized = Sanitizer::normalizeCharReferences( $text );
         $normalized = self::normalizeWhitespace( $normalized );
         return str_replace( '"', '&quot;', $normalized );
 }
 932
 /**
  * Replace whitespace with plain spaces.
  *
  * A CR LF pair becomes a single space; any other lone space, CR, LF or
  * tab becomes one space. Runs are not collapsed.
  *
  * @param string $text
  * @return string
  */
 private static function normalizeWhitespace( $text ) {
         $whitespace = '/\r\n|[\x20\x0d\x0a\x09]/';
         $normalized = preg_replace( $whitespace, ' ', $text );
         return $normalized;
 }
 939
 /**
  * Ensure that any entities and character references are legal
  * for XML and XHTML specifically. Any stray bits will be
  * &amp;-escaped to result in a valid text fragment.
  *
  * Every match of MW_CHAR_REFS_REGEX is handed to
  * normalizeCharReferencesCallback(), which
  * a. keeps named references only when normalizeEntity() knows them
  * b. keeps numeric references only when they name a legal character
  * c. writes hexadecimal references as &#x, not &#X
  * and escapes everything else.
  *
  * @param string $text
  * @return string
  * @private
  */
 static function normalizeCharReferences( $text ) {
         $callback = array( 'Sanitizer', 'normalizeCharReferencesCallback' );
         return preg_replace_callback( MW_CHAR_REFS_REGEX, $callback, $text );
 }
 /**
  * preg_replace_callback() helper for normalizeCharReferences().
  *
  * Group 1 is a named reference, group 2 a decimal one, groups 3 and 4
  * hexadecimal ones (&#x and &#X). When no group is filled in, or the
  * chosen helper returns null, the whole match is escaped with
  * htmlspecialchars().
  *
  * @param array $matches Match set for MW_CHAR_REFS_REGEX
  * @return string
  */
 static function normalizeCharReferencesCallback( $matches ) {
         if( $matches[1] != '' ) {
                 $normalized = Sanitizer::normalizeEntity( $matches[1] );
         } elseif( $matches[2] != '' ) {
                 $normalized = Sanitizer::decCharReference( $matches[2] );
         } elseif( $matches[3] != '' ) {
                 $normalized = Sanitizer::hexCharReference( $matches[3] );
         } elseif( $matches[4] != '' ) {
                 $normalized = Sanitizer::hexCharReference( $matches[4] );
         } else {
                 $normalized = null;
         }

         return is_null( $normalized )
                 ? htmlspecialchars( $matches[0] )
                 : $normalized;
 }
 981
 /**
  * If the named entity is a key of $wgHtmlEntityAliases, return a
  * reference to the name it is mapped to. Otherwise, if it is a key of
  * $wgHtmlEntities, return the named entity reference as is. Otherwise,
  * return HTML-escaped text of pseudo-entity source (eg &amp;foo;)
  *
  * @param string $name Entity name without the surrounding '&' and ';'
  * @return string
  * @static
  */
 static function normalizeEntity( $name ) {
         global $wgHtmlEntities, $wgHtmlEntityAliases;

         // Aliases are looked up before the plain entity table.
         if( isset( $wgHtmlEntityAliases[$name] ) ) {
                 return '&' . $wgHtmlEntityAliases[$name] . ';';
         }
         if( isset( $wgHtmlEntities[$name] ) ) {
                 return '&' . $name . ';';
         }
         return '&amp;' . $name . ';';
 }
1002
/**
 * Normalize a decimal numeric character reference.
 *
 * @param string $codepoint Decimal digits of the reference
 * @return string|null Reference rewritten as "&#N;" from the integer
 *   value, or null when validateCodepoint() rejects that value
 */
static function decCharReference( $codepoint ) {
        $point = intval( $codepoint );
        return Sanitizer::validateCodepoint( $point )
                ? sprintf( '&#%d;', $point )
                : null;
}
1011
/**
 * Normalize a hexadecimal numeric character reference.
 *
 * @param string $codepoint Hexadecimal digits of the reference
 * @return string|null Reference rewritten as "&#xN;" with lowercase hex
 *   digits, or null when the input is not purely hexadecimal or
 *   validateCodepoint() rejects the value
 */
static function hexCharReference( $codepoint ) {
        // MW_CHAR_REFS_REGEX captures [0-9A-Za-z]+ here, and hexdec()
        // silently skips characters that are not hex digits, so something
        // like "41g" would otherwise be rewritten into the valid but
        // different reference &#x41;. Refuse such input instead; the
        // caller then escapes the original text.
        if( !preg_match( '/^[0-9A-Fa-f]+$/D', $codepoint ) ) {
                return null;
        }

        $point = hexdec( $codepoint );
        if( Sanitizer::validateCodepoint( $point ) ) {
                return sprintf( '&#x%x;', $point );
        } else {
                return null;
        }
}
1020
/**
 * Returns true if a given Unicode codepoint is a valid character in XML.
 *
 * Accepted are tab, LF, CR and the inclusive ranges 0x20-0xd7ff,
 * 0xe000-0xfffd and 0x10000-0x10ffff.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
        // The only control characters let through: tab, LF and CR.
        if( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
                return true;
        }

        // Inclusive ranges; what falls between them (0xd800-0xdfff and
        // 0xfffe-0xffff) is rejected.
        $ranges = array(
                array(    0x20,   0xd7ff ),
                array(  0xe000,   0xfffd ),
                array( 0x10000, 0x10ffff ),
        );
        foreach( $ranges as $range ) {
                if( $codepoint >= $range[0] && $codepoint <= $range[1] ) {
                        return true;
                }
        }
        return false;
}
1034
/**
 * Decode any character references, numeric or named entities,
 * in the text and return a UTF-8 string.
 *
 * Every match of MW_CHAR_REFS_REGEX is replaced by whatever
 * decodeCharReferencesCallback() returns for it.
 *
 * @param string $text
 * @return string
 * @public
 * @static
 */
public static function decodeCharReferences( $text ) {
        $callback = array( 'Sanitizer', 'decodeCharReferencesCallback' );
        return preg_replace_callback( MW_CHAR_REFS_REGEX, $callback, $text );
}
1050
/**
 * preg_replace_callback() helper for decodeCharReferences().
 *
 * Group 1 is a named reference and goes to decodeEntity(); group 2 is
 * decimal, groups 3 and 4 hexadecimal, and their numeric value goes to
 * decodeChar(). With no group filled in, the match is returned untouched.
 *
 * @param array $matches Match set for MW_CHAR_REFS_REGEX
 * @return string
 */
static function decodeCharReferencesCallback( $matches ) {
        if( $matches[1] != '' ) {
                return Sanitizer::decodeEntity( $matches[1] );
        }

        if( $matches[2] != '' ) {
                $codepoint = intval( $matches[2] );
        } elseif( $matches[3] != '' ) {
                $codepoint = hexdec( $matches[3] );
        } elseif( $matches[4] != '' ) {
                $codepoint = hexdec( $matches[4] );
        } else {
                # Last case should be an ampersand by itself
                return $matches[0];
        }
        return Sanitizer::decodeChar( $codepoint );
}
1068
/**
 * Return UTF-8 string for a codepoint if that is a valid
 * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 *
 * @param int $codepoint
 * @return string codepointToUtf8() output when validateCodepoint()
 *   accepts the value, UTF8_REPLACEMENT otherwise
 * @private
 */
static function decodeChar( $codepoint ) {
        return Sanitizer::validateCodepoint( $codepoint )
                ? codepointToUtf8( $codepoint )
                : UTF8_REPLACEMENT;
}
1083
/**
 * If the named entity (after resolving it through $wgHtmlEntityAliases)
 * is a key of $wgHtmlEntities, return the UTF-8 encoding of that
 * character. Otherwise, returns pseudo-entity source (eg &foo;), built
 * from the alias target when the name was an alias.
 *
 * @param string $name Entity name without the surrounding '&' and ';'
 * @return string
 */
static function decodeEntity( $name ) {
        global $wgHtmlEntities, $wgHtmlEntityAliases;

        $canonical = isset( $wgHtmlEntityAliases[$name] )
                ? $wgHtmlEntityAliases[$name]
                : $name;

        if( !isset( $wgHtmlEntities[$canonical] ) ) {
                return "&$canonical;";
        }
        return codepointToUtf8( $wgHtmlEntities[$canonical] );
}
1103
/**
 * Fetch the whitelist of acceptable attributes for a given
 * element name.
 *
 * The full table comes from setupAttributeWhitelist() and is kept in a
 * function-static variable after the first call.
 *
 * @param string $element
 * @return array Allowed attribute names; empty for an element that has
 *   no entry in the table
 */
static function attributeWhitelist( $element ) {
        static $list;
        if( !isset( $list ) ) {
                $list = Sanitizer::setupAttributeWhitelist();
        }

        if( isset( $list[$element] ) ) {
                return $list[$element];
        }
        return array();
}
1120
/**
 * Build the table of allowed attributes: each array key is an allowed
 * HTML element, each value the list of attributes allowed on it.
 *
 * @return array Element name => list of attribute names
 */
static function setupAttributeWhitelist() {
        # Building blocks shared by several elements
        $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
        $block = array_merge( $common, array( 'align' ) );
        $tablealign = array( 'align', 'char', 'charoff', 'valign' );
        $tablecell = array(
                'abbr', 'axis', 'headers', 'scope', 'rowspan', 'colspan',
                'nowrap',  # deprecated
                'width',   # deprecated
                'height',  # deprecated
                'bgcolor', # deprecated
        );

        # Combined lists that are used for more than one element
        $edit = array_merge( $common, array( 'cite', 'datetime' ) );
        $rowgroup = array_merge( $common, $tablealign );
        $column = array_merge( $common, array( 'span', 'width' ), $tablealign );
        $cell = array_merge( $common, $tablecell, $tablealign );

        # Numbers refer to sections in HTML 4.01 standard describing the element.
        # See: http://www.w3.org/TR/html4/
        $whitelist = array();

        # 7.5.4
        $whitelist['div'] = $block;
        $whitelist['center'] = $common; # deprecated
        $whitelist['span'] = $block; # ??

        # 7.5.5
        foreach( array( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ) as $heading ) {
                $whitelist[$heading] = $block;
        }

        # 7.5.6 (address) and 8.2.4 (bdo) have no entry

        # 9.2.1 (dfn, samp, kbd, abbr and acronym have no entry)
        foreach( array( 'em', 'strong', 'cite', 'code', 'var' ) as $phrase ) {
                $whitelist[$phrase] = $common;
        }

        # 9.2.2 (q has no entry)
        $whitelist['blockquote'] = array_merge( $common, array( 'cite' ) );

        # 9.2.3
        $whitelist['sub'] = $common;
        $whitelist['sup'] = $common;

        # 9.3.1
        $whitelist['p'] = $block;

        # 9.3.2
        $whitelist['br'] = array( 'id', 'class', 'title', 'style', 'clear' );

        # 9.3.4
        $whitelist['pre'] = array_merge( $common, array( 'width' ) );

        # 9.4
        $whitelist['ins'] = $edit;
        $whitelist['del'] = $edit;

        # 10.2
        $whitelist['ul'] = array_merge( $common, array( 'type' ) );
        $whitelist['ol'] = array_merge( $common, array( 'type', 'start' ) );
        $whitelist['li'] = array_merge( $common, array( 'type', 'value' ) );

        # 10.3
        $whitelist['dl'] = $common;
        $whitelist['dd'] = $common;
        $whitelist['dt'] = $common;

        # 11.2.1
        $whitelist['table'] = array_merge( $common, array(
                'summary', 'width', 'border', 'frame',
                'rules', 'cellspacing', 'cellpadding',
                'align', 'bgcolor',
        ) );

        # 11.2.2
        $whitelist['caption'] = array_merge( $common, array( 'align' ) );

        # 11.2.3
        $whitelist['thead'] = $rowgroup;
        $whitelist['tfoot'] = $rowgroup;
        $whitelist['tbody'] = $rowgroup;

        # 11.2.4
        $whitelist['colgroup'] = $column;
        $whitelist['col'] = $column;

        # 11.2.5
        $whitelist['tr'] = array_merge( $common, array( 'bgcolor' ), $tablealign );

        # 11.2.6
        $whitelist['td'] = $cell;
        $whitelist['th'] = $cell;

        # 13.2
        # Not usually allowed, but may be used for extension-style hooks
        # such as <math> when it is rasterized
        $whitelist['img'] = array_merge( $common, array( 'alt' ) );

        # 15.2.1
        $fontStyle = array( 'tt', 'b', 'i', 'big', 'small', 'strike', 's', 'u' );
        foreach( $fontStyle as $tag ) {
                $whitelist[$tag] = $common;
        }

        # 15.2.2 (basefont has no entry)
        $whitelist['font'] = array_merge( $common, array( 'size', 'color', 'face' ) );

        # 15.3
        $whitelist['hr'] = array_merge( $common, array( 'noshade', 'size', 'width' ) );

        # XHTML Ruby annotation text module, simple ruby only.
        # http://www.w3c.org/TR/ruby/
        # (rbc and rtc have no entry; rt does not get 'rbspan')
        foreach( array( 'ruby', 'rb', 'rt', 'rp' ) as $tag ) {
                $whitelist[$tag] = $common;
        }

        # MathML root element, where used for extensions
        # 'title' may not be 100% valid here; it's XHTML
        # http://www.w3.org/TR/REC-MathML/
        $whitelist['math'] = array( 'class', 'style', 'id', 'title' );

        return $whitelist;
}
1271
/**
 * Take a fragment of (potentially invalid) HTML and return
 * a version with any tags removed, encoded as plain text.
 *
 * Warning: this return value must be further escaped for literal
 * inclusion in HTML output as of 1.10!
 *
 * @param string $text HTML fragment
 * @return string Text with character references decoded and whitespace
 *   characters turned into spaces
 */
static function stripAllTags( $text ) {
        # Actual <tags>
        $withoutTags = StringUtils::delimiterReplace( '<', '>', '', $text );

        # Normalize &entities and whitespace
        $decoded = self::decodeCharReferences( $withoutTags );
        return self::normalizeWhitespace( $decoded );
}
1292
/**
 * Hack up a private DOCTYPE with HTML's standard entity declarations.
 * PHP 4 seemed to know these if you gave it an HTML doctype, but
 * PHP 5.1 doesn't.
 *
 * Use for passing XHTML fragments to PHP's XML parsing functions
 *
 * @return string DOCTYPE with one <!ENTITY> declaration per entry of
 *   $wgHtmlEntities
 * @static
 */
static function hackDocType() {
        global $wgHtmlEntities;

        $declarations = array();
        foreach( $wgHtmlEntities as $entity => $codepoint ) {
                $declarations[] = "<!ENTITY $entity \"&#$codepoint;\">";
        }

        // The declarations are joined without any separator.
        return "<!DOCTYPE html [\n" . implode( '', $declarations ) . "]>\n";
}
1312
/**
 * Clean up a URL before it is used for an external link.
 *
 * Character references are decoded, then square brackets, angle
 * brackets, double quotes, the bytes 0x00-0x20 and 0x7F are
 * percent-encoded. If the result looks like "protocol:[//host]rest",
 * whitespace and the invisible characters listed below are removed from
 * the host part only.
 *
 * @param string $url
 * @return string
 */
static function cleanUrl( $url ) {
        # Normalize any HTML entities in input. They will be
        # re-escaped by makeExternalLink().
        $url = Sanitizer::decodeCharReferences( $url );

        # Escape any control characters introduced by the above step.
        # This goes through a callback instead of the /e modifier: /e is
        # deprecated since PHP 5.5 and gone in PHP 7.0, and it
        # backslash-escaped quotes and NUL in the matched text before
        # evaluating, so '"' came out as %5C%22 rather than %22.
        $url = preg_replace_callback(
                '/[\][<>"\\x00-\\x20\\x7F]/',
                array( 'Sanitizer', 'cleanUrlCallback' ),
                $url );

        # Validate hostname portion
        $matches = array();
        if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
                list( /* $whole */, $protocol, $host, $rest ) = $matches;

                // Characters that will be ignored in IDNs.
                // http://tools.ietf.org/html/3454#section-3.1
                // Strip them before further processing so blacklists and such work.
                $strip = "/
                        \\s|          # general whitespace
                        \xc2\xad|     # 00ad SOFT HYPHEN
                        \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
                        \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
                        \xe2\x81\xa0| # 2060 WORD JOINER
                        \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
                        \xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
                        \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
                        \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
                        \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
                        \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
                        \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
                        [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
                        /xuD";

                $host = preg_replace( $strip, '', $host );

                // @fixme: validate hostnames here

                return $protocol . $host . $rest;
        } else {
                return $url;
        }
}

/**
 * preg_replace_callback() helper for cleanUrl(): percent-encode the
 * matched character.
 *
 * @param array $matches Match set; index 0 is the matched character
 * @return string
 */
private static function cleanUrlCallback( $matches ) {
        return urlencode( $matches[0] );
}
1354
1355 }