ab521e829a49d944975961553e49e432222f4fa2
[lhc/web/wiklou.git] / includes / Sanitizer.php
1 <?php
2 /**
3 * XHTML sanitizer for MediaWiki
4 *
5 * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
6 * http://www.mediawiki.org/
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
22 *
23 * @file
24 * @ingroup Parser
25 */
26
27 /**
28 * Regular expression to match various types of character references in
29 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
30 */
# Capture groups: 1 = named entity, 2 = decimal reference, 3 = hex reference
# written with a lowercase "x", 4 = hex reference written with an uppercase
# "X", 5 = a bare ampersand.
#
# The hex alternatives only accept real hexadecimal digits. A wider class
# such as [0-9A-Za-z] would hand strings like "ZZ" or "41G" to hexdec(),
# which ignores non-hex characters and would therefore turn a malformed
# reference into a different, valid one. Malformed references now fall
# through to the bare-ampersand alternative instead.
define( 'MW_CHAR_REFS_REGEX',
	'/&([A-Za-z0-9\x80-\xff]+);
	 |&\#([0-9]+);
	 |&\#x([0-9A-Fa-f]+);
	 |&\#X([0-9A-Fa-f]+);
	 |(&)/x' );
37
38 /**
39 * Regular expression to match HTML/XML attribute pairs within a tag.
40 * Allows some... latitude.
41 * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
42 */
# Character class for attribute names, and the whitespace class used
# between attributes and around "=".
$attrib = '[A-Za-z0-9]';
$space = '[\x09\x0a\x0d\x20]';

# The pattern is compiled with the "x" flag, so the line breaks between the
# pieces below and the "#" comments embedded in them are ignored by PCRE.
# Capture groups: 1 = attribute name, 2 = the whole "= value" part,
# 3 = double-quoted value, 4 = single-quoted value, 5 = unquoted value,
# 6 = unquoted "#hex" value.
define( 'MW_ATTRIBS_REGEX', implode( "\n", array(
	'/(?:^|' . $space . ')(' . $attrib . '+)',
	'(' . $space . '*=' . $space . '*',
	'(?:',
	'# The attribute value: quoted or alone',
	'"([^<"]*)"',
	'| \'([^<\']*)\'',
	'| ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)',
	'| (\#[0-9a-fA-F]+) # Technically wrong, but lots of',
	'# colors are specified like this.',
	'# We\'ll be normalizing it.',
	')',
	')?(?=' . $space . '|$)/sx',
) ) );
58
59 /**
60 * List of all named character entities defined in HTML 4.01
61 * http://www.w3.org/TR/html4/sgml/entities.html
62 * @private
63 */
global $wgHtmlEntities;
# Entity name => integer value. Keys are case-sensitive ("Alpha" and "alpha"
# are distinct entries). Entries are grouped by initial letter.
$wgHtmlEntities = array(
	# A
	'Aacute' => 193, 'aacute' => 225, 'Acirc' => 194, 'acirc' => 226,
	'acute' => 180, 'AElig' => 198, 'aelig' => 230, 'Agrave' => 192,
	'agrave' => 224, 'alefsym' => 8501, 'Alpha' => 913, 'alpha' => 945,
	'amp' => 38, 'and' => 8743, 'ang' => 8736, 'Aring' => 197,
	'aring' => 229, 'asymp' => 8776, 'Atilde' => 195, 'atilde' => 227,
	'Auml' => 196, 'auml' => 228,
	# B
	'bdquo' => 8222, 'Beta' => 914, 'beta' => 946, 'brvbar' => 166,
	'bull' => 8226,
	# C
	'cap' => 8745, 'Ccedil' => 199, 'ccedil' => 231, 'cedil' => 184,
	'cent' => 162, 'Chi' => 935, 'chi' => 967, 'circ' => 710,
	'clubs' => 9827, 'cong' => 8773, 'copy' => 169, 'crarr' => 8629,
	'cup' => 8746, 'curren' => 164,
	# D
	'dagger' => 8224, 'Dagger' => 8225, 'darr' => 8595, 'dArr' => 8659,
	'deg' => 176, 'Delta' => 916, 'delta' => 948, 'diams' => 9830,
	'divide' => 247,
	# E
	'Eacute' => 201, 'eacute' => 233, 'Ecirc' => 202, 'ecirc' => 234,
	'Egrave' => 200, 'egrave' => 232, 'empty' => 8709, 'emsp' => 8195,
	'ensp' => 8194, 'Epsilon' => 917, 'epsilon' => 949, 'equiv' => 8801,
	'Eta' => 919, 'eta' => 951, 'ETH' => 208, 'eth' => 240,
	'Euml' => 203, 'euml' => 235, 'euro' => 8364, 'exist' => 8707,
	# F
	'fnof' => 402, 'forall' => 8704, 'frac12' => 189, 'frac14' => 188,
	'frac34' => 190, 'frasl' => 8260,
	# G
	'Gamma' => 915, 'gamma' => 947, 'ge' => 8805, 'gt' => 62,
	# H
	'harr' => 8596, 'hArr' => 8660, 'hearts' => 9829, 'hellip' => 8230,
	# I
	'Iacute' => 205, 'iacute' => 237, 'Icirc' => 206, 'icirc' => 238,
	'iexcl' => 161, 'Igrave' => 204, 'igrave' => 236, 'image' => 8465,
	'infin' => 8734, 'int' => 8747, 'Iota' => 921, 'iota' => 953,
	'iquest' => 191, 'isin' => 8712, 'Iuml' => 207, 'iuml' => 239,
	# K
	'Kappa' => 922, 'kappa' => 954,
	# L
	'Lambda' => 923, 'lambda' => 955, 'lang' => 9001, 'laquo' => 171,
	'larr' => 8592, 'lArr' => 8656, 'lceil' => 8968, 'ldquo' => 8220,
	'le' => 8804, 'lfloor' => 8970, 'lowast' => 8727, 'loz' => 9674,
	'lrm' => 8206, 'lsaquo' => 8249, 'lsquo' => 8216, 'lt' => 60,
	# M
	'macr' => 175, 'mdash' => 8212, 'micro' => 181, 'middot' => 183,
	'minus' => 8722, 'Mu' => 924, 'mu' => 956,
	# N
	'nabla' => 8711, 'nbsp' => 160, 'ndash' => 8211, 'ne' => 8800,
	'ni' => 8715, 'not' => 172, 'notin' => 8713, 'nsub' => 8836,
	'Ntilde' => 209, 'ntilde' => 241, 'Nu' => 925, 'nu' => 957,
	# O
	'Oacute' => 211, 'oacute' => 243, 'Ocirc' => 212, 'ocirc' => 244,
	'OElig' => 338, 'oelig' => 339, 'Ograve' => 210, 'ograve' => 242,
	'oline' => 8254, 'Omega' => 937, 'omega' => 969, 'Omicron' => 927,
	'omicron' => 959, 'oplus' => 8853, 'or' => 8744, 'ordf' => 170,
	'ordm' => 186, 'Oslash' => 216, 'oslash' => 248, 'Otilde' => 213,
	'otilde' => 245, 'otimes' => 8855, 'Ouml' => 214, 'ouml' => 246,
	# P
	'para' => 182, 'part' => 8706, 'permil' => 8240, 'perp' => 8869,
	'Phi' => 934, 'phi' => 966, 'Pi' => 928, 'pi' => 960,
	'piv' => 982, 'plusmn' => 177, 'pound' => 163, 'prime' => 8242,
	'Prime' => 8243, 'prod' => 8719, 'prop' => 8733, 'Psi' => 936,
	'psi' => 968,
	# Q
	'quot' => 34,
	# R
	'radic' => 8730, 'rang' => 9002, 'raquo' => 187, 'rarr' => 8594,
	'rArr' => 8658, 'rceil' => 8969, 'rdquo' => 8221, 'real' => 8476,
	'reg' => 174, 'rfloor' => 8971, 'Rho' => 929, 'rho' => 961,
	'rlm' => 8207, 'rsaquo' => 8250, 'rsquo' => 8217,
	# S
	'sbquo' => 8218, 'Scaron' => 352, 'scaron' => 353, 'sdot' => 8901,
	'sect' => 167, 'shy' => 173, 'Sigma' => 931, 'sigma' => 963,
	'sigmaf' => 962, 'sim' => 8764, 'spades' => 9824, 'sub' => 8834,
	'sube' => 8838, 'sum' => 8721, 'sup' => 8835, 'sup1' => 185,
	'sup2' => 178, 'sup3' => 179, 'supe' => 8839, 'szlig' => 223,
	# T
	'Tau' => 932, 'tau' => 964, 'there4' => 8756, 'Theta' => 920,
	'theta' => 952, 'thetasym' => 977, 'thinsp' => 8201, 'THORN' => 222,
	'thorn' => 254, 'tilde' => 732, 'times' => 215, 'trade' => 8482,
	# U
	'Uacute' => 218, 'uacute' => 250, 'uarr' => 8593, 'uArr' => 8657,
	'Ucirc' => 219, 'ucirc' => 251, 'Ugrave' => 217, 'ugrave' => 249,
	'uml' => 168, 'upsih' => 978, 'Upsilon' => 933, 'upsilon' => 965,
	'Uuml' => 220, 'uuml' => 252,
	# W
	'weierp' => 8472,
	# X
	'Xi' => 926, 'xi' => 958,
	# Y
	'Yacute' => 221, 'yacute' => 253, 'yen' => 165, 'Yuml' => 376,
	'yuml' => 255,
	# Z
	'Zeta' => 918, 'zeta' => 950, 'zwj' => 8205, 'zwnj' => 8204 );
318
319 /**
320 * Character entity aliases accepted by MediaWiki
321 */
global $wgHtmlEntityAliases;
# Alias name => canonical entity name. Both aliases (one written in Hebrew
# letters, one in Arabic letters) resolve to the "rlm" entity.
$wgHtmlEntityAliases = array_fill_keys( array( 'רלמ', 'رلم' ), 'rlm' );
327
328
329 /**
330 * XHTML sanitizer for MediaWiki
331 * @ingroup Parser
332 */
333 class Sanitizer {
334 /**
335 * Cleans up HTML, removes dangerous tags and attributes, and
336 * removes HTML comments
337 * @private
338 * @param $text String
339 * @param $processCallback Callback to do any variable or parameter replacements in HTML attribute values
340 * @param $args Array for the processing callback
341 * @param $extratags Array for any extra tags to include
342 * @param $removetags Array for any tags (default or extra) to exclude
343 * @return string
344 */
static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
	global $wgUseTidy;

	// Lookup tables keyed by tag name; built on the first call and kept in
	// statics for later calls.
	static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
		$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;

	wfProfileIn( __METHOD__ );

	if ( !$staticInitialised ) {
		# Tags that must be closed
		$pairedTags = array(
			'a', 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u', 'abbr'
		);
		$singleTags = array( 'br', 'hr', 'li', 'dt', 'dd' );
		# Elements that cannot have close tags
		$unclosableTags = array( 'br', 'hr' );
		# Tags that can be nested
		$nestableTags = array(
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
		);
		# Can only appear inside table, we will close them
		$tableParts = array( 'td', 'th', 'tr' );
		# Tags used by list
		$listContainers = array( 'ul', 'ol' );
		# Tags that can appear in a list
		$listItems = array( 'li' );

		# Everything is stored flipped (name => index) so membership is an isset()
		$htmlsingleallowed = array_flip( array_unique( array_merge( $singleTags, $tableParts ) ) );
		$htmlelementsStatic = array_flip( array_unique( array_merge( $singleTags, $pairedTags, $nestableTags ) ) );
		$htmlpairsStatic = array_flip( $pairedTags );
		$htmlsingle = array_flip( $singleTags );
		$htmlsingleonly = array_flip( $unclosableTags );
		$htmlnest = array_flip( $nestableTags );
		$tabletags = array_flip( $tableParts );
		$htmllist = array_flip( $listContainers );
		$listtags = array_flip( $listItems );
		$staticInitialised = true;
	}

	# Fold the caller's $extratags and $removetags into the per-call tables
	$extratags = array_flip( $extratags );
	$removetags = array_flip( $removetags );
	$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
	$htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ) , $removetags );

	# Remove HTML comments, then split on "<": the first piece is plain
	# text, every later piece starts where a tag might begin.
	$text = self::removeHTMLcomments( $text );
	$bits = explode( '<', $text );
	$text = str_replace( '>', '&gt;', array_shift( $bits ) );

	if ( $wgUseTidy ) {
		# No nesting bookkeeping in this branch: tags are only filtered by
		# name and have their attributes cleaned.
		# this might be possible using tidy itself
		foreach ( $bits as $bit ) {
			preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
				$bit, $m );
			@list( /* whole match */, $slash, $t, $params, $brace, $rest ) = $m;
			$t = strtolower( $t );
			if ( !isset( $htmlelements[$t] ) ) {
				$text .= '&lt;' . str_replace( '>', '&gt;', $bit );
				continue;
			}
			if ( is_callable( $processCallback ) ) {
				call_user_func_array( $processCallback, array( &$params, $args ) );
			}
			$newparams = self::fixTagAttributes( $params, $t );
			$rest = str_replace( '>', '&gt;', $rest );
			$text .= "<$slash$t$newparams$brace$rest";
		}
	} else {
		# $tagstack holds the currently open tags; entering a <table> parks
		# the current stack on $tablestack and starts a fresh one.
		$tagstack = $tablestack = array();
		foreach ( $bits as $bit ) {
			$m = array();
			if ( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $bit, $m ) ) {
				list( /* whole match */, $slash, $t, $params, $brace, $rest ) = $m;
			} else {
				$slash = $t = $params = $brace = $rest = null;
			}
			$t = strtolower( $t );

			$badtag = 0;
			if ( isset( $htmlelements[$t] ) ) {
				if ( $slash ) {
					# Closing a tag: check it against the stack
					if ( isset( $htmlsingleonly[$t] ) ) {
						$badtag = 1;
					} elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
						if ( isset( $htmlsingleallowed[$ot] ) ) {
							# Pop all elements with an optional close tag
							# and see if we find a match below them
							$optstack = array( $ot );
							while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
								isset( $htmlsingleallowed[$ot] ) )
							{
								$optstack[] = $ot;
							}
							if ( $t != $ot ) {
								# No match. Push the optional elements back again
								$badtag = 1;
								while ( $ot = @array_pop( $optstack ) ) {
									$tagstack[] = $ot;
								}
							}
						} else {
							$tagstack[] = $ot;
							# <li> can be nested in <ul> or <ol>, skip those cases:
							if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
								$badtag = 1;
							}
						}
					} elseif ( $t == 'table' ) {
						# Leaving a table: resume the stack parked when it was opened
						$tagstack = array_pop( $tablestack );
					}
					$newparams = '';
				} else {
					# Opening a tag: keep track for later
					if ( isset( $tabletags[$t] ) &&
						!in_array( 'table', $tagstack ) ) {
						$badtag = 1;
					} elseif ( in_array( $t, $tagstack ) &&
						!isset( $htmlnest[$t] ) ) {
						$badtag = 1;
					} elseif ( $brace == '/>' &&
						isset( $htmlpairs[$t] ) ) {
						# Is it a self closed htmlpair ? (bug 5487)
						$badtag = 1;
					} elseif ( isset( $htmlsingleonly[$t] ) ) {
						# Hack to force empty tag for uncloseable elements
						$brace = '/>';
					} elseif ( isset( $htmlsingle[$t] ) ) {
						# Hack to not close $htmlsingle tags
						$brace = NULL;
					} elseif ( isset( $tabletags[$t] )
						&& in_array( $t, $tagstack ) ) {
						// New table tag but forgot to close the previous one
						$text .= "</$t>";
					} else {
						if ( $t == 'table' ) {
							$tablestack[] = $tagstack;
							$tagstack = array();
						}
						$tagstack[] = $t;
					}

					# Replace any variables or template parameters with
					# plaintext results.
					if ( is_callable( $processCallback ) ) {
						call_user_func_array( $processCallback, array( &$params, $args ) );
					}

					# Strip non-approved attributes from the tag
					$newparams = self::fixTagAttributes( $params, $t );
				}
				if ( !$badtag ) {
					$rest = str_replace( '>', '&gt;', $rest );
					$close = ( $brace == '/>' && !$slash ) ? ' /' : '';
					$text .= "<$slash$t$newparams$close>$rest";
					continue;
				}
			}
			# Unknown or rejected tag: emit it as escaped text
			$text .= '&lt;' . str_replace( '>', '&gt;', $bit );
		}
		# Close off any remaining tags
		while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
			$text .= "</$t>\n";
			if ( $t == 'table' ) {
				$tagstack = array_pop( $tablestack );
			}
		}
	}
	wfProfileOut( __METHOD__ );
	return $text;
}
525
526 /**
527 * Remove '<!--', '-->', and everything between.
528 * To avoid leaving blank lines, when a comment is both preceded
529 * and followed by a newline (ignoring spaces), trim leading and
530 * trailing spaces and one of the newlines.
531 *
532 * @private
533 * @param $text String
534 * @return string
535 */
static function removeHTMLcomments( $text ) {
	wfProfileIn( __METHOD__ );
	for ( ;; ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			break;
		}
		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; leave the rest of the text alone
			break;
		}
		$close += 3;

		# Widen the span over the spaces on both sides of the comment,
		# starting from the character just before it.
		$left = max( $open - 1, 0 );
		$span = $close - $left;
		while ( $left > 0 && substr( $text, $left, 1 ) === ' ' ) {
			$left--;
			$span++;
		}
		while ( substr( $text, $left + $span, 1 ) === ' ' ) {
			$span++;
		}

		$ownLine = substr( $text, $left, 1 ) === "\n"
			&& substr( $text, $left + $span, 1 ) === "\n";
		if ( $ownLine ) {
			# The comment sits between two newlines: drop it together with
			# the surrounding spaces and keep a single newline.
			$text = substr_replace( $text, "\n", $left, $span + 1 );
		} else {
			# Otherwise cut out only the comment itself.
			$text = substr_replace( $text, '', $open, $close - $open );
		}
	}
	wfProfileOut( __METHOD__ );
	return $text;
}
570
571 /**
572 * Take an array of attribute names and values and normalize or discard
573 * illegal values for the given element type.
574 *
575 * - Discards attributes not on a whitelist for the given element
576 * - Unsafe style attributes are discarded
577 * - Invalid id attributes are reencoded
578 *
579 * @param $attribs Array
580 * @param $element String
581 * @return Array
582 *
583 * @todo Check for legal values where the DTD limits things.
584 * @todo Check for unique id attribute :P
585 */
static function validateTagAttributes( $attribs, $element ) {
	// Look up the whitelist for this element, then filter against it.
	$whitelist = self::attributeWhitelist( $element );
	return self::validateAttributes( $attribs, $whitelist );
}
590
591 /**
592 * Take an array of attribute names and values and normalize or discard
593 * illegal values for the given whitelist.
594 *
595 * - Discards attributes not on the given whitelist
596 * - Unsafe style attributes are discarded
597 * - Invalid id attributes are reencoded
598 *
599 * @param $attribs Array
600 * @param $whitelist Array: list of allowed attribute names
601 * @return Array
602 *
603 * @todo Check for legal values where the DTD limits things.
604 * @todo Check for unique id attribute :P
605 */
static function validateAttributes( $attribs, $whitelist ) {
	global $wgEnforceHtmlIds;

	$allowed = array_flip( $whitelist );
	$urlPattern = '/^(' . wfUrlProtocols() . ')[^\s]+$/';
	// RDFa properties allow URIs, so their values get a javascript: check
	$rdfaNames = array( 'rel', 'rev', 'about', 'property', 'resource', 'datatype', 'typeof' );

	$clean = array();
	foreach ( $attribs as $attribute => $value ) {
		if ( !isset( $allowed[$attribute] ) ) {
			continue;
		}

		# Strip javascript "expression" from stylesheets.
		# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
		if ( $attribute == 'style' ) {
			$value = self::checkCss( $value );
			if ( $value === false ) {
				# checkCss rejected the whole value; drop the attribute
				continue;
			}
		}

		if ( $attribute === 'id' ) {
			$value = self::escapeId( $value,
				$wgEnforceHtmlIds ? 'noninitial' : 'xml' );
		}

		// Drop any href or src attribute not using an allowed protocol.
		// NOTE: this also drops all relative URLs
		$isUrlAttribute = ( $attribute === 'href' || $attribute === 'src' );
		if ( $isUrlAttribute && !preg_match( $urlPattern, $value ) ) {
			continue;
		}

		// Paranoia. Allow "simple" values but suppress javascript
		if ( in_array( $attribute, $rdfaNames, true )
			&& preg_match( '/(^|\s)javascript\s*:/i', $value ) ) {
			continue;
		}

		// A later occurrence of the same name overrides an earlier one,
		// so the output holds at most one attribute per name.
		$clean[$attribute] = $value;
	}
	return $clean;
}
654
655 /**
656 * Merge two sets of HTML attributes. Conflicting items in the second set
657 * will override those in the first, except for 'class' attributes which
658 * will be combined (if they're both strings).
659 *
660 * @todo implement merging for other attributes such as style
661 * @param $a Array
662 * @param $b Array
663 * @return array
664 */
static function mergeAttributes( $a, $b ) {
	$merged = array_merge( $a, $b );

	// Only 'class' gets special treatment, and only when both sides
	// supply one.
	if ( !isset( $a['class'] ) || !isset( $b['class'] ) ) {
		return $merged;
	}
	$first = $a['class'];
	$second = $b['class'];
	if ( !is_string( $first ) || !is_string( $second ) || $first === $second ) {
		return $merged;
	}

	// Combine both class lists and drop repeated names, keeping the
	// first occurrence of each.
	$names = preg_split( '/\s+/', "{$first} {$second}",
		-1, PREG_SPLIT_NO_EMPTY );
	$merged['class'] = implode( ' ', array_unique( $names ) );
	return $merged;
}
676
677 /**
678 * Pick apart some CSS and check it for forbidden or unsafe structures.
679 * Returns a sanitized string, or false if it was just too evil.
680 *
681 * Currently URL references, 'expression', 'tps' are forbidden.
682 *
683 * @param $value String
684 * @return Mixed
685 */
static function checkCss( $value ) {
	$stripped = Sanitizer::decodeCharReferences( $value );

	// Remove any comments; IE gets token splitting wrong
	$stripped = StringUtils::delimiterReplace( '/*', '*/', ' ', $stripped );

	// This is the form handed back to the caller when the checks pass:
	// character references decoded and comments replaced by a space, but
	// CSS backslash escapes left untouched.
	$value = $stripped;

	// Decode CSS hex escapes (a backslash, 1-6 hex digits and one optional
	// whitespace character) so the checks below cannot be dodged by
	// escaping letters. A callback is used instead of the "e" (eval)
	// pattern modifier, which PHP deprecated in 5.5 and removed in 7.0.
	$stripped = preg_replace_callback(
		'!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
		array( 'Sanitizer', 'cssDecodeCallback' ),
		$stripped );
	// Drop any remaining backslashes before matching
	$stripped = str_replace( '\\', '', $stripped );
	if( preg_match( '/(?:expression|tps*:\/\/|url\\s*\().*/is',
			$stripped ) ) {
		# Forbidden construct found; reject the whole value
		return false;
	}

	return $value;
}

/**
 * Regex replace callback for checkCss(): turns one matched CSS hex escape
 * into the UTF-8 text of the code point it names.
 *
 * @param $matches Array: match set, with the hex digits in $matches[1]
 * @return String
 */
private static function cssDecodeCallback( $matches ) {
	return codepointToUtf8( hexdec( $matches[1] ) );
}
706
707 /**
708 * Take a tag soup fragment listing an HTML element's attributes
709 * and normalize it to well-formed XML, discarding unwanted attributes.
710 * Output is safe for further wikitext processing, with escaping of
711 * values that could trigger problems.
712 *
713 * - Normalizes attribute names to lowercase
714 * - Discards attributes not on a whitelist for the given element
715 * - Turns broken or invalid entities into plaintext
716 * - Double-quotes all attribute values
717 * - Attributes without values are given the name as attribute
718 * - Double attributes are discarded
719 * - Unsafe style attributes are discarded
720 * - Prepends space if there are attributes.
721 *
722 * @param $text String
723 * @param $element String
724 * @return String
725 */
static function fixTagAttributes( $text, $element ) {
	if ( trim( $text ) == '' ) {
		return '';
	}

	$decoded = self::decodeTagAttributes( $text );
	$allowed = self::validateTagAttributes( $decoded, $element );

	// Every surviving attribute is emitted as ` name="value"`, so the
	// result starts with a space whenever it is non-empty.
	$out = '';
	foreach ( $allowed as $attribute => $value ) {
		$encAttribute = htmlspecialchars( $attribute );
		$encValue = self::safeEncodeAttribute( $value );
		$out .= " $encAttribute=\"$encValue\"";
	}
	return $out;
}
743
744 /**
745 * Encode an attribute value for HTML output.
746 * @param $text String
747 * @return HTML-encoded text fragment
748 */
static function encodeAttribute( $text ) {
	// Whitespace is normalized during attribute decoding, so newlines,
	// carriage returns and tabs must be written as numeric references
	// here or they won't be preserved.
	$whitespaceRefs = array(
		"\n" => '&#10;',
		"\r" => '&#13;',
		"\t" => '&#9;',
	);
	return strtr( htmlspecialchars( $text, ENT_QUOTES ), $whitespaceRefs );
}
763
764 /**
765 * Encode an attribute value for HTML tags, with extra armoring
766 * against further wiki processing.
767 * @param $text String
768 * @return HTML-encoded text fragment
769 */
static function safeEncodeAttribute( $text ) {
	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$wikiArmor = array(
		'<'    => '&lt;',   // This should never happen,
		'>'    => '&gt;',   // we've received invalid input
		'"'    => '&quot;', // which should have been escaped.
		'{'    => '&#123;',
		'['    => '&#91;',
		"''"   => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC'  => '&#82;FC',
		'PMID' => '&#80;MID',
		'|'    => '&#124;',
		'__'   => '&#95;_',
	);
	$armored = strtr( self::encodeAttribute( $text ), $wikiArmor );

	# Finally hide the colon of every URL protocol matched by
	# wfUrlProtocols(), via armorLinksCallback.
	return preg_replace_callback(
		'/(' . wfUrlProtocols() . ')/',
		array( 'Sanitizer', 'armorLinksCallback' ),
		$armored );
}
796
797 /**
798 * Given a value escape it so that it can be used in an id attribute and
799 * return it, this does not validate the value however (see first link)
800 *
801 * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
802 * in the id and
803 * name attributes
804 * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
805 *
806 * @param $id String: id to validate
807 * @param $options Mixed: string or array of strings (default is array()):
808 * 'noninitial': This is a non-initial fragment of an id, not a full id,
809 * so don't pay attention if the first character isn't valid at the
810 * beginning of an id.
811 * 'xml': Don't restrict the id to be HTML4-compatible. This option
812 * allows any alphabetic character to be used, per the XML standard.
813 * Therefore, it also completely changes the type of escaping: instead
814 * of weird dot-encoding, runs of invalid characters (mostly
815 * whitespace) are just compressed into a single underscore.
816 * @return String
817 */
static function escapeId( $id, $options = array() ) {
	$options = (array)$options;
	$isFragment = in_array( 'noninitial', $options );

	if ( in_array( 'xml', $options ) ) {
		# XML-style escaping. For the patterns used, see the XML 1.0 standard,
		# 5th edition, NameStartChar and NameChar: <http://www.w3.org/TR/REC-xml/>
		$startClass = ':a-zA-Z_\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}'
			. '\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}'
			. '\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}';
		$anyClass = $startClass . '.\-0-9\xB7\x{0300}-\x{036F}'
			. '\x{203F}-\x{2040}';

		# Each run of invalid characters becomes one underscore. Underscores
		# are part of the run so no consecutive underscores are produced.
		$id = preg_replace( "/([^$anyClass]|_)+/u", '_', $id );
		$id = trim( $id, '_' );

		# A full id has to begin with a name-start character
		if ( !$isFragment && !preg_match( "/^[$startClass]/u", $id ) ) {
			$id = "_$id";
		}
		return $id;
	}

	# HTML4-style escaping: spaces become underscores, then the text is
	# URL-encoded, with "%3A" turned back into ":" and every other "%"
	# into ".".
	$id = strtr( $id, ' ', '_' );
	$id = urlencode( self::decodeCharReferences( $id ) );
	$id = str_replace( array( '%3A', '%' ), array( ':', '.' ), $id );

	if ( !$isFragment && !preg_match( '/^[a-zA-Z]/', $id ) ) {
		// Initial character must be a letter!
		$id = "x$id";
	}
	return $id;
}
857
858 /**
859 * Given a value, escape it so that it can be used as a CSS class and
860 * return it.
861 *
862 * @todo For extra validity, input should be validated UTF-8.
863 *
864 * @see http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
865 *
866 * @param $class String
867 * @return String
868 */
static function escapeClass( $class ) {
	// Pass 1: a leading digit or hyphen, control characters, spaces, the
	// listed punctuation, and the byte pair C2 A0 each become "_".
	$unwanted = '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/';
	$class = preg_replace( $unwanted, '_', $class );

	// Pass 2: collapse runs of underscores, then drop any trailing ones.
	$class = preg_replace( '/_+/', '_', $class );
	return rtrim( $class, '_' );
}
876
877 /**
878 * Given HTML input, escape with htmlspecialchars but un-escape entities.
879 * This allows (generally harmless) entities like &nbsp; to survive.
880 *
881 * @param $html String to escape
882 * @return String: escaped input
883 */
static function escapeHtmlAllowEntities( $html ) {
	# It seems wise to escape ' as well as ", as a matter of course. Can't
	# hurt.
	$escaped = htmlspecialchars( $html, ENT_QUOTES );

	# Turn "&amp;" back into "&" so character references reappear, then let
	# normalizeCharReferences process the references and stray ampersands.
	$withAmpersands = str_replace( '&amp;', '&', $escaped );
	return self::normalizeCharReferences( $withAmpersands );
}
892
893 /**
894 * Regex replace callback for armoring links against further processing.
895 * @param $matches Array
896 * @return string
897 */
private static function armorLinksCallback( $matches ) {
	// Swap every colon in the captured text for its numeric reference
	return strtr( $matches[1], array( ':' => '&#58;' ) );
}
901
902 /**
903 * Return an associative array of attribute names and values from
904 * a partial tag string. Attribute names are forced to lowercase,
905 * character references are decoded to UTF-8 text.
906 *
907 * @param $text String
908 * @return Array
909 */
public static function decodeTagAttributes( $text ) {
	if ( trim( $text ) == '' ) {
		return array();
	}

	$sets = array();
	$found = preg_match_all( MW_ATTRIBS_REGEX, $text, $sets, PREG_SET_ORDER );
	if ( !$found ) {
		return array();
	}

	$decoded = array();
	foreach ( $sets as $set ) {
		$name = strtolower( $set[1] );
		$raw = self::getTagAttributeCallback( $set );

		// Normalize whitespace: collapse each run to one space, then trim
		$collapsed = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

		// Decode character references. A name seen again later in the
		// text overwrites the earlier value.
		$decoded[$name] = self::decodeCharReferences( $collapsed );
	}
	return $decoded;
}
939
940 /**
941 * Pick the appropriate attribute value from a match set from the
942 * MW_ATTRIBS_REGEX matches.
943 *
944 * @param $set Array
945 * @return String
946 */
private static function getTagAttributeCallback( $set ) {
	// Value groups, checked from the highest index down:
	// 6 = illegal #XXXXXX color with no quotes, 5 = no quotes,
	// 4 = single-quoted, 3 = double-quoted.
	foreach ( array( 6, 5, 4, 3 ) as $group ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( !isset( $set[2] ) ) {
		# In XHTML, attributes must have a value.
		# For 'reduced' form, return explicitly the attribute name here.
		return $set[1];
	}

	throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
}
968
969 /**
970 * Normalize whitespace and character references in an XML source-
971 * encoded text for an attribute value.
972 *
973 * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
974 * but note that we're not returning the value, but are returning
975 * XML source fragments that will be slapped into output.
976 *
977 * @param $text String
978 * @return String
979 */
private static function normalizeAttributeValue( $text ) {
	// Order matters: character references first, then whitespace, and
	// only then are double quotes replaced.
	$refsNormalized = self::normalizeCharReferences( $text );
	$spaced = self::normalizeWhitespace( $refsNormalized );
	return str_replace( '"', '&quot;', $spaced );
}
985
private static function normalizeWhitespace( $text ) {
	// A CR LF pair counts as one unit; otherwise each single space, CR,
	// LF or tab is replaced by one space.
	$whitespaceUnit = '/\r\n|[\x20\x0d\x0a\x09]/';
	return preg_replace( $whitespaceUnit, ' ', $text );
}
992
993 /**
994 * Ensure that any entities and character references are legal
995 * for XML and XHTML specifically. Any stray bits will be
996 * &amp;-escaped to result in a valid text fragment.
997 *
998 * a. any named char refs must be known in XHTML
999 * b. any numeric char refs must be legal chars, not invalid or forbidden
1000 * c. use &#x, not &#X
1001 * d. fix or reject non-valid attributes
1002 *
1003 * @param $text String
1004 * @return String
1005 * @private
1006 */
static function normalizeCharReferences( $text ) {
	// Each match of MW_CHAR_REFS_REGEX is rewritten by the callback
	$handler = array( 'Sanitizer', 'normalizeCharReferencesCallback' );
	return preg_replace_callback( MW_CHAR_REFS_REGEX, $handler, $text );
}
1013 /**
1014 * @param $matches String
1015 * @return String
1016 */
static function normalizeCharReferencesCallback( $matches ) {
	// Groups of MW_CHAR_REFS_REGEX: 1 = named entity, 2 = decimal,
	// 3 and 4 = the two hexadecimal spellings.
	if ( $matches[1] != '' ) {
		$normalized = self::normalizeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		$normalized = self::decCharReference( $matches[2] );
	} else {
		$hexDigits = ( $matches[3] != '' ) ? $matches[3] : $matches[4];
		$normalized = ( $hexDigits != '' )
			? self::hexCharReference( $hexDigits )
			: null;
	}

	// Nothing usable (a bare ampersand or a rejected reference): escape
	// the matched source text instead.
	return is_null( $normalized ) ? htmlspecialchars( $matches[0] ) : $normalized;
}
1034
1035 /**
1036 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
1037 * return the named entity reference as is. If the entity is a
1038 * MediaWiki-specific alias, returns the HTML equivalent. Otherwise,
1039 * returns HTML-escaped text of pseudo-entity source (eg &amp;foo;)
1040 *
1041 * @param $name String
1042 * @return String
1043 */
static function normalizeEntity( $name ) {
	global $wgHtmlEntities, $wgHtmlEntityAliases;

	// Aliases take precedence and are rewritten to the name they map to
	if ( isset( $wgHtmlEntityAliases[$name] ) ) {
		return '&' . $wgHtmlEntityAliases[$name] . ';';
	}

	// Known entities pass through; anything else gets its ampersand escaped
	$lead = isset( $wgHtmlEntities[$name] ) ? '&' : '&amp;';
	return $lead . $name . ';';
}
1054
static function decCharReference( $codepoint ) {
	// Re-emit the reference in decimal form from the parsed integer, or
	// return null when validateCodepoint() rejects it.
	$point = intval( $codepoint );
	return self::validateCodepoint( $point )
		? sprintf( '&#%d;', $point )
		: null;
}
1063
static function hexCharReference( $codepoint ) {
	// Re-emit the reference as "&#x...;" with lowercase hex digits, or
	// return null when validateCodepoint() rejects it.
	$point = hexdec( $codepoint );
	return self::validateCodepoint( $point )
		? sprintf( '&#x%x;', $point )
		: null;
}
1072
1073 /**
1074 * Returns true if a given Unicode codepoint is a valid character in XML.
1075 * @param $codepoint Integer
1076 * @return Boolean
1077 */
private static function validateCodepoint( $codepoint ) {
	// Tab, line feed and carriage return are the only accepted values
	// below 0x20.
	if ( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
		return true;
	}

	// Remaining accepted values, as inclusive ranges
	$ranges = array(
		array( 0x20, 0xd7ff ),
		array( 0xe000, 0xfffd ),
		array( 0x10000, 0x10ffff ),
	);
	foreach ( $ranges as $range ) {
		if ( $codepoint >= $range[0] && $codepoint <= $range[1] ) {
			return true;
		}
	}
	return false;
}
1086
/**
 * Replace every character reference in the text, whether numeric or a
 * named entity, with the character it stands for, and return the result
 * as a UTF-8 string.
 *
 * @param $text String
 * @return String
 */
public static function decodeCharReferences( $text ) {
	# Each match of MW_CHAR_REFS_REGEX is handed to the callback below.
	$decoder = array( 'Sanitizer', 'decodeCharReferencesCallback' );

	return preg_replace_callback( MW_CHAR_REFS_REGEX, $decoder, $text );
}
1100
/**
 * Callback for decodeCharReferences(): decode one match of
 * MW_CHAR_REFS_REGEX.
 *
 * @param $matches Array: regex match; [1] is an entity name, [2] decimal
 *   digits, [3] digits after "&#x", [4] digits after "&#X"
 * @return String Decoded text, or the matched source unchanged when it
 *   is a lone ampersand or a malformed hexadecimal reference
 */
static function decodeCharReferencesCallback( $matches ) {
	if ( $matches[1] != '' ) {
		return Sanitizer::decodeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		return Sanitizer::decodeChar( intval( $matches[2] ) );
	}

	# Both hexadecimal spellings are decoded the same way.
	$hexDigits = ( $matches[3] != '' ) ? $matches[3] : $matches[4];
	if ( $hexDigits != '' ) {
		# The regex captures [0-9A-Za-z]+ and hexdec() silently skips
		# characters that are not hex digits, so "&#xg41;" would decode
		# to "A". A malformed reference is not a reference at all;
		# leave its source text alone.
		if ( !preg_match( '/^[0-9A-Fa-f]+$/D', $hexDigits ) ) {
			return $matches[0];
		}
		return Sanitizer::decodeChar( hexdec( $hexDigits ) );
	}

	# Last case should be an ampersand by itself
	return $matches[0];
}
1118
/**
 * Convert a code point to its UTF-8 string. A code point that is not
 * a valid character reference yields U+FFFD REPLACEMENT CHARACTER
 * instead.
 *
 * @param $codepoint Integer
 * @return String
 * @private
 */
static function decodeChar( $codepoint ) {
	if ( !Sanitizer::validateCodepoint( $codepoint ) ) {
		return UTF8_REPLACEMENT;
	}

	return codepointToUtf8( $codepoint );
}
1133
/**
 * Decode a named entity to UTF-8.
 *
 * A name listed in $wgHtmlEntityAliases is first replaced by the name
 * it maps to. If the resulting name is defined in $wgHtmlEntities (the
 * HTML 4.0/XHTML 1.0 DTD entities), the UTF-8 encoding of its code
 * point is returned. Otherwise the pseudo-entity source (eg &foo;) is
 * returned, built from the name after alias resolution.
 *
 * @param $name String
 * @return String
 */
static function decodeEntity( $name ) {
	global $wgHtmlEntities, $wgHtmlEntityAliases;

	$resolved = isset( $wgHtmlEntityAliases[$name] )
		? $wgHtmlEntityAliases[$name]
		: $name;

	if ( !isset( $wgHtmlEntities[$resolved] ) ) {
		return "&$resolved;";
	}

	return codepointToUtf8( $wgHtmlEntities[$resolved] );
}
1153
/**
 * Look up the attributes that are acceptable on a given element.
 *
 * The full table is built once by setupAttributeWhitelist() and kept
 * in a static variable for later calls.
 *
 * @param $element String
 * @return Array: attribute names; empty when the element has no entry
 */
static function attributeWhitelist( $element ) {
	static $whitelist = null;

	if ( $whitelist === null ) {
		$whitelist = Sanitizer::setupAttributeWhitelist();
	}

	if ( isset( $whitelist[$element] ) ) {
		return $whitelist[$element];
	}
	return array();
}
1169
/**
 * Build the table that maps each allowed HTML element name to the
 * list of attribute names allowed on it.
 *
 * @return Array: element name => array of attribute names
 */
static function setupAttributeWhitelist() {
	# Attributes shared by most elements.
	$universal = array( 'id', 'class', 'lang', 'dir', 'title', 'style',
		# RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
		'about', 'property', 'resource', 'datatype', 'typeof',
	);

	# Building blocks for the table below.
	$aligned = array_merge( $universal, array( 'align' ) );
	$cellAlignment = array( 'align', 'char', 'charoff', 'valign' );
	$cellSpecific = array(
		'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', # deprecated
		'width', # deprecated
		'height', # deprecated
		'bgcolor' # deprecated
	);

	# Attribute lists used by more than one element.
	$revision = array_merge( $universal, array( 'cite', 'datetime' ) );
	$rowGroup = array_merge( $universal, $cellAlignment );
	$column = array_merge( $universal, array( 'span', 'width' ), $cellAlignment );
	$cell = array_merge( $universal, $cellSpecific, $cellAlignment );

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: http://www.w3.org/TR/html4/
	return array(
		# 7.5.4
		'div'        => $aligned,
		'center'     => $universal, # deprecated
		'span'       => $aligned, # ??

		# 7.5.5
		'h1'         => $aligned,
		'h2'         => $aligned,
		'h3'         => $aligned,
		'h4'         => $aligned,
		'h5'         => $aligned,
		'h6'         => $aligned,

		# 7.5.6: address is not listed
		# 8.2.4: bdo is not listed

		# 9.2.1 (dfn, samp, kbd and acronym are not listed)
		'em'         => $universal,
		'strong'     => $universal,
		'cite'       => $universal,
		'code'       => $universal,
		'var'        => $universal,
		'abbr'       => $universal,

		# 9.2.2 (q is not listed)
		'blockquote' => array_merge( $universal, array( 'cite' ) ),

		# 9.2.3
		'sub'        => $universal,
		'sup'        => $universal,

		# 9.3.1
		'p'          => $aligned,

		# 9.3.2
		'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),

		# 9.3.4
		'pre'        => array_merge( $universal, array( 'width' ) ),

		# 9.4
		'ins'        => $revision,
		'del'        => $revision,

		# 10.2
		'ul'         => array_merge( $universal, array( 'type' ) ),
		'ol'         => array_merge( $universal, array( 'type', 'start' ) ),
		'li'         => array_merge( $universal, array( 'type', 'value' ) ),

		# 10.3
		'dl'         => $universal,
		'dd'         => $universal,
		'dt'         => $universal,

		# 11.2.1
		'table'      => array_merge( $universal, array(
			'summary', 'width', 'border', 'frame',
			'rules', 'cellspacing', 'cellpadding',
			'align', 'bgcolor',
		) ),

		# 11.2.2
		'caption'    => $aligned,

		# 11.2.3
		'thead'      => $rowGroup,
		'tfoot'      => $rowGroup,
		'tbody'      => $rowGroup,

		# 11.2.4
		'colgroup'   => $column,
		'col'        => $column,

		# 11.2.5
		'tr'         => array_merge( $universal, array( 'bgcolor' ), $cellAlignment ),

		# 11.2.6
		'td'         => $cell,
		'th'         => $cell,

		# 12.2
		'a'          => array_merge( $universal, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa

		# 13.2
		# Not usually allowed, but may be used for extension-style hooks
		# such as <math> when it is rasterized
		'img'        => array_merge( $universal, array( 'alt' ) ),

		# 15.2.1
		'tt'         => $universal,
		'b'          => $universal,
		'i'          => $universal,
		'big'        => $universal,
		'small'      => $universal,
		'strike'     => $universal,
		's'          => $universal,
		'u'          => $universal,

		# 15.2.2 (basefont is not listed)
		'font'       => array_merge( $universal, array( 'size', 'color', 'face' ) ),

		# 15.3
		'hr'         => array_merge( $universal, array( 'noshade', 'size', 'width' ) ),

		# XHTML Ruby annotation text module, simple ruby only.
		# http://www.w3c.org/TR/ruby/
		# (rbc and rtc are not listed; rt does not get 'rbspan')
		'ruby'       => $universal,
		'rb'         => $universal,
		'rt'         => $universal,
		'rp'         => $universal,

		# MathML root element, where used for extensions
		# 'title' may not be 100% valid here; it's XHTML
		# http://www.w3.org/TR/REC-MathML/
		'math'       => array( 'class', 'style', 'id', 'title' ),
	);
}
1327
/**
 * Reduce a fragment of (potentially invalid) HTML to plain text:
 * tags are removed, character references are decoded and whitespace
 * is normalized.
 *
 * Warning: this return value must be further escaped for literal
 * inclusion in HTML output as of 1.10!
 *
 * @param $text String: HTML fragment
 * @return String
 */
static function stripAllTags( $text ) {
	# Drop everything between "<" and ">", i.e. the actual <tags>
	$tagless = StringUtils::delimiterReplace( '<', '>', '', $text );

	# Then decode &entities and tidy up the whitespace
	return self::normalizeWhitespace( self::decodeCharReferences( $tagless ) );
}
1348
/**
 * Build a private DOCTYPE carrying an entity declaration for every
 * entry of $wgHtmlEntities.
 * PHP 4 seemed to know these if you gave it an HTML doctype, but
 * PHP 5.1 doesn't.
 *
 * Use for passing XHTML fragments to PHP's XML parsing functions
 *
 * @return String
 */
static function hackDocType() {
	global $wgHtmlEntities;

	# One <!ENTITY> declaration per known entity, mapped to its
	# numeric character reference.
	$declarations = '';
	foreach ( $wgHtmlEntities as $entityName => $number ) {
		$declarations .= "<!ENTITY $entityName \"&#$number;\">";
	}

	return "<!DOCTYPE html [\n" . $declarations . "]>\n";
}
1367
/**
 * Clean up a URL: decode character references, percent-encode a fixed
 * set of unsafe characters, and strip from the host part the characters
 * that are ignored in IDNs.
 *
 * @param $url String
 * @return String
 */
static function cleanUrl( $url ) {
	# Normalize any HTML entities in input. They will be
	# re-escaped by makeExternalLink().
	$url = Sanitizer::decodeCharReferences( $url );

	# Escape any control characters introduced by the above step.
	# This uses a callback rather than the /e modifier: /e is deprecated
	# as of PHP 5.5 and removed in PHP 7, and it applies addslashes-style
	# escaping to the match before evaluating it, which turned '"' into
	# "%5C%22" instead of "%22".
	$url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F]/',
		array( 'Sanitizer', 'cleanUrlCallback' ), $url );

	# Validate hostname portion
	$matches = array();
	if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
		list( /* $whole */, $protocol, $host, $rest ) = $matches;

		// Characters that will be ignored in IDNs.
		// http://tools.ietf.org/html/rfc3454#section-3.1
		// Strip them before further processing so blacklists and such work.
		$strip = "/
			\\s|          # general whitespace
			\xc2\xad|     # 00ad SOFT HYPHEN
			\xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
			\xe2\x80\x8b| # 200b ZERO WIDTH SPACE
			\xe2\x81\xa0| # 2060 WORD JOINER
			\xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
			\xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
			\xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
			\xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
			\xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
			\xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
			\xe2\x80\x8d| # 200d ZERO WIDTH JOINER
			[\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
			/xuD";

		// NOTE(review): with the /u flag preg_replace() returns null when
		// $host is not valid UTF-8, which would drop the host from the
		// result -- confirm callers always pass valid UTF-8.
		$host = preg_replace( $strip, '', $host );

		// @fixme: validate hostnames here

		return $protocol . $host . $rest;
	} else {
		return $url;
	}
}

/**
 * Callback for cleanUrl(): percent-encode one matched character.
 *
 * @param $matches Array: regex match; [0] is the character to encode
 * @return String
 * @private
 */
static function cleanUrlCallback( $matches ) {
	return urlencode( $matches[0] );
}
1409
1410 }