allow RDFa attributes; missing support for <a>, will be added as a parser tag hook
[lhc/web/wiklou.git] / includes / Sanitizer.php
1 <?php
2 /**
3 * XHTML sanitizer for MediaWiki
4 *
5 * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
6 * http://www.mediawiki.org/
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
22 *
23 * @file
24 * @ingroup Parser
25 */
26
/**
 * Regular expression to match various types of character references in
 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences.
 *
 * Capture groups, in order (the callbacks index into the match by number,
 * so the grouping must not change):
 *  1. named entity, e.g. &nbsp;
 *  2. decimal numeric reference, e.g. &#160;
 *  3. hexadecimal reference written with a lowercase marker, e.g. &#xa0;
 *  4. hexadecimal reference written with an uppercase marker, e.g. &#XA0;
 *  5. a bare ampersand that starts none of the above
 *
 * The hexadecimal alternatives accept hex digits only. hexdec() skips over
 * characters that are not hex digits, so a looser class would let something
 * like "&#xG41;" be interpreted as U+0041 instead of being left alone as
 * plain text.
 */
define( 'MW_CHAR_REFS_REGEX',
	'/&([A-Za-z0-9\x80-\xff]+);
	 |&\#([0-9]+);
	 |&\#x([0-9A-Fa-f]+);
	 |&\#X([0-9A-Fa-f]+);
	 |(&)/x' );
37
/**
 * Regular expression to match HTML/XML attribute pairs within a tag.
 * Allows some... latitude.
 * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
 *
 * Capture groups:
 *  1. attribute name
 *  2. the whole "= value" part (absent for a bare attribute name)
 *  3. value when it was double-quoted
 *  4. value when it was single-quoted
 *  5. value when it was unquoted
 *  6. unquoted value of the form #hex (colour shorthand)
 *
 * The pattern is compiled with the "x" modifier, so the line breaks and the
 * "#" comments inside the string literal are ignored by PCRE.
 *
 * NOTE(review): $attrib only admits ASCII letters and digits, so attribute
 * names containing ':' or '-' never match this pattern.
 */
// Character class for one character of an attribute name.
$attrib = '[A-Za-z0-9]';
// Character class for one whitespace character (tab, LF, CR, space).
$space = '[\x09\x0a\x0d\x20]';
define( 'MW_ATTRIBS_REGEX',
"/(?:^|$space)($attrib+)
($space*=$space*
(?:
# The attribute value: quoted or alone
\"([^<\"]*)\"
| '([^<']*)'
| ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
| (\#[0-9a-fA-F]+) # Technically wrong, but lots of
# colors are specified like this.
# We'll be normalizing it.
)
)?(?=$space|\$)/sx" );
58
/**
 * List of all named character entities defined in HTML 4.01
 * http://www.w3.org/TR/html4/sgml/entities.html
 *
 * Maps the entity name (case-sensitive, without the leading '&' and the
 * trailing ';') to the decimal Unicode code point it stands for.
 * Sanitizer::normalizeEntity() uses the keys to decide whether a named
 * reference is known.
 *
 * @private
 */
global $wgHtmlEntities;
$wgHtmlEntities = array(
	'Aacute' => 193,
	'aacute' => 225,
	'Acirc' => 194,
	'acirc' => 226,
	'acute' => 180,
	'AElig' => 198,
	'aelig' => 230,
	'Agrave' => 192,
	'agrave' => 224,
	'alefsym' => 8501,
	'Alpha' => 913,
	'alpha' => 945,
	'amp' => 38,
	'and' => 8743,
	'ang' => 8736,
	'Aring' => 197,
	'aring' => 229,
	'asymp' => 8776,
	'Atilde' => 195,
	'atilde' => 227,
	'Auml' => 196,
	'auml' => 228,
	'bdquo' => 8222,
	'Beta' => 914,
	'beta' => 946,
	'brvbar' => 166,
	'bull' => 8226,
	'cap' => 8745,
	'Ccedil' => 199,
	'ccedil' => 231,
	'cedil' => 184,
	'cent' => 162,
	'Chi' => 935,
	'chi' => 967,
	'circ' => 710,
	'clubs' => 9827,
	'cong' => 8773,
	'copy' => 169,
	'crarr' => 8629,
	'cup' => 8746,
	'curren' => 164,
	'dagger' => 8224,
	'Dagger' => 8225,
	'darr' => 8595,
	'dArr' => 8659,
	'deg' => 176,
	'Delta' => 916,
	'delta' => 948,
	'diams' => 9830,
	'divide' => 247,
	'Eacute' => 201,
	'eacute' => 233,
	'Ecirc' => 202,
	'ecirc' => 234,
	'Egrave' => 200,
	'egrave' => 232,
	'empty' => 8709,
	'emsp' => 8195,
	'ensp' => 8194,
	'Epsilon' => 917,
	'epsilon' => 949,
	'equiv' => 8801,
	'Eta' => 919,
	'eta' => 951,
	'ETH' => 208,
	'eth' => 240,
	'Euml' => 203,
	'euml' => 235,
	'euro' => 8364,
	'exist' => 8707,
	'fnof' => 402,
	'forall' => 8704,
	'frac12' => 189,
	'frac14' => 188,
	'frac34' => 190,
	'frasl' => 8260,
	'Gamma' => 915,
	'gamma' => 947,
	'ge' => 8805,
	'gt' => 62,
	'harr' => 8596,
	'hArr' => 8660,
	'hearts' => 9829,
	'hellip' => 8230,
	'Iacute' => 205,
	'iacute' => 237,
	'Icirc' => 206,
	'icirc' => 238,
	'iexcl' => 161,
	'Igrave' => 204,
	'igrave' => 236,
	'image' => 8465,
	'infin' => 8734,
	'int' => 8747,
	'Iota' => 921,
	'iota' => 953,
	'iquest' => 191,
	'isin' => 8712,
	'Iuml' => 207,
	'iuml' => 239,
	'Kappa' => 922,
	'kappa' => 954,
	'Lambda' => 923,
	'lambda' => 955,
	'lang' => 9001,
	'laquo' => 171,
	'larr' => 8592,
	'lArr' => 8656,
	'lceil' => 8968,
	'ldquo' => 8220,
	'le' => 8804,
	'lfloor' => 8970,
	'lowast' => 8727,
	'loz' => 9674,
	'lrm' => 8206,
	'lsaquo' => 8249,
	'lsquo' => 8216,
	'lt' => 60,
	'macr' => 175,
	'mdash' => 8212,
	'micro' => 181,
	'middot' => 183,
	'minus' => 8722,
	'Mu' => 924,
	'mu' => 956,
	'nabla' => 8711,
	'nbsp' => 160,
	'ndash' => 8211,
	'ne' => 8800,
	'ni' => 8715,
	'not' => 172,
	'notin' => 8713,
	'nsub' => 8836,
	'Ntilde' => 209,
	'ntilde' => 241,
	'Nu' => 925,
	'nu' => 957,
	'Oacute' => 211,
	'oacute' => 243,
	'Ocirc' => 212,
	'ocirc' => 244,
	'OElig' => 338,
	'oelig' => 339,
	'Ograve' => 210,
	'ograve' => 242,
	'oline' => 8254,
	'Omega' => 937,
	'omega' => 969,
	'Omicron' => 927,
	'omicron' => 959,
	'oplus' => 8853,
	'or' => 8744,
	'ordf' => 170,
	'ordm' => 186,
	'Oslash' => 216,
	'oslash' => 248,
	'Otilde' => 213,
	'otilde' => 245,
	'otimes' => 8855,
	'Ouml' => 214,
	'ouml' => 246,
	'para' => 182,
	'part' => 8706,
	'permil' => 8240,
	'perp' => 8869,
	'Phi' => 934,
	'phi' => 966,
	'Pi' => 928,
	'pi' => 960,
	'piv' => 982,
	'plusmn' => 177,
	'pound' => 163,
	'prime' => 8242,
	'Prime' => 8243,
	'prod' => 8719,
	'prop' => 8733,
	'Psi' => 936,
	'psi' => 968,
	'quot' => 34,
	'radic' => 8730,
	'rang' => 9002,
	'raquo' => 187,
	'rarr' => 8594,
	'rArr' => 8658,
	'rceil' => 8969,
	'rdquo' => 8221,
	'real' => 8476,
	'reg' => 174,
	'rfloor' => 8971,
	'Rho' => 929,
	'rho' => 961,
	'rlm' => 8207,
	'rsaquo' => 8250,
	'rsquo' => 8217,
	'sbquo' => 8218,
	'Scaron' => 352,
	'scaron' => 353,
	'sdot' => 8901,
	'sect' => 167,
	'shy' => 173,
	'Sigma' => 931,
	'sigma' => 963,
	'sigmaf' => 962,
	'sim' => 8764,
	'spades' => 9824,
	'sub' => 8834,
	'sube' => 8838,
	'sum' => 8721,
	'sup' => 8835,
	'sup1' => 185,
	'sup2' => 178,
	'sup3' => 179,
	'supe' => 8839,
	'szlig' => 223,
	'Tau' => 932,
	'tau' => 964,
	'there4' => 8756,
	'Theta' => 920,
	'theta' => 952,
	'thetasym' => 977,
	'thinsp' => 8201,
	'THORN' => 222,
	'thorn' => 254,
	'tilde' => 732,
	'times' => 215,
	'trade' => 8482,
	'Uacute' => 218,
	'uacute' => 250,
	'uarr' => 8593,
	'uArr' => 8657,
	'Ucirc' => 219,
	'ucirc' => 251,
	'Ugrave' => 217,
	'ugrave' => 249,
	'uml' => 168,
	'upsih' => 978,
	'Upsilon' => 933,
	'upsilon' => 965,
	'Uuml' => 220,
	'uuml' => 252,
	'weierp' => 8472,
	'Xi' => 926,
	'xi' => 958,
	'Yacute' => 221,
	'yacute' => 253,
	'yen' => 165,
	'Yuml' => 376,
	'yuml' => 255,
	'Zeta' => 918,
	'zeta' => 950,
	'zwj' => 8205,
	'zwnj' => 8204 );
318
/**
 * Character entity aliases accepted by MediaWiki.
 *
 * Maps an alternative entity name to the HTML entity name it stands for.
 * Both aliases below resolve to 'rlm'.
 */
global $wgHtmlEntityAliases;
$wgHtmlEntityAliases = array();
$wgHtmlEntityAliases['רלמ'] = 'rlm';
$wgHtmlEntityAliases['رلم'] = 'rlm';
327
328
329 /**
330 * XHTML sanitizer for MediaWiki
331 * @ingroup Parser
332 */
333 class Sanitizer {
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments.
 *
 * The text is split on '<'. Every piece that parses as a whitelisted tag is
 * re-emitted with its attributes filtered through fixTagAttributes(); every
 * other piece is emitted with its '<' and '>' escaped.
 *
 * When $wgUseTidy is false, a tag stack is additionally maintained so that
 * mismatched close tags are escaped and tags still open at the end are
 * closed. When $wgUseTidy is true, no balancing is attempted here.
 *
 * @private
 * @param $text String
 * @param $processCallback Callback to do any variable or parameter replacements in HTML attribute values
 * @param $args Array for the processing callback
 * @param $extratags Array for any extra tags to include
 * @param $removetags Array for any tags (default or extra) to exclude
 * @return string
 */
static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
	global $wgUseTidy;

	# Tag tables are built once per request and cached in static variables.
	static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
		$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;

	wfProfileIn( __METHOD__ );

	if ( !$staticInitialised ) {

		$htmlpairsStatic = array( # Tags that must be closed
			'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u', 'abbr'
		);
		$htmlsingle = array(
			'br', 'hr', 'li', 'dt', 'dd'
		);
		$htmlsingleonly = array( # Elements that cannot have close tags
			'br', 'hr'
		);
		$htmlnest = array( # Tags that can be nested--??
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
		);
		$tabletags = array( # Can only appear inside table, we will close them
			'td', 'th', 'tr',
		);
		$htmllist = array( # Tags used by list
			'ul','ol',
		);
		$listtags = array( # Tags that can appear in a list
			'li',
		);

		# Tags whose close tag is optional
		$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
		# Every tag that is allowed at all
		$htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );

		# Convert them all to hashtables for faster lookup
		# (tag name becomes the key, so membership is an isset() check)
		$vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
			'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
		foreach ( $vars as $var ) {
			$$var = array_flip( $$var );
		}
		$staticInitialised = true;
	}
	# Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
	$extratags = array_flip( $extratags );
	$removetags = array_flip( $removetags );
	$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
	$htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ) , $removetags );

	# Remove HTML comments
	$text = Sanitizer::removeHTMLcomments( $text );
	$bits = explode( '<', $text );
	# The piece before the first '<' cannot start a tag; only escape stray '>'
	$text = str_replace( '>', '&gt;', array_shift( $bits ) );
	if(!$wgUseTidy) {
		# $tagstack holds the currently open tags; $tablestack saves the
		# outer $tagstack each time a <table> is opened.
		$tagstack = $tablestack = array();
		foreach ( $bits as $x ) {
			$regs = array();
			# Split the piece into: optional '/', tag name, attributes,
			# closing '>' or '/>', and the text following the tag.
			if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
				list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
			} else {
				$slash = $t = $params = $brace = $rest = null;
			}

			$badtag = 0 ;
			if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
				# Check our stack
				if ( $slash ) {
					# Closing a tag...
					if( isset( $htmlsingleonly[$t] ) ) {
						$badtag = 1;
					} elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
						if ( isset( $htmlsingleallowed[$ot] ) ) {
							# Pop all elements with an optional close tag
							# and see if we find a match below them
							$optstack = array();
							array_push ($optstack, $ot);
							while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
									isset( $htmlsingleallowed[$ot] ) )
							{
								array_push ($optstack, $ot);
							}
							if ( $t != $ot ) {
								# No match. Push the optional elements back again
								$badtag = 1;
								while ( $ot = @array_pop( $optstack ) ) {
									array_push( $tagstack, $ot );
								}
							}
						} else {
							# Put the popped tag back; it was not the one being closed
							@array_push( $tagstack, $ot );
							# <li> can be nested in <ul> or <ol>, skip those cases:
							if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) {
								$badtag = 1;
							}
						}
					} else {
						# Matching close tag; leaving a table restores the outer stack
						if ( $t == 'table' ) {
							$tagstack = array_pop( $tablestack );
						}
					}
					# Close tags never carry attributes
					$newparams = '';
				} else {
					# Keep track for later
					if ( isset( $tabletags[$t] ) &&
					! in_array( 'table', $tagstack ) ) {
						$badtag = 1;
					} else if ( in_array( $t, $tagstack ) &&
					! isset( $htmlnest [$t ] ) ) {
						$badtag = 1 ;
					# Is it a self closed htmlpair ? (bug 5487)
					} else if( $brace == '/>' &&
					isset( $htmlpairs[$t] ) ) {
						$badtag = 1;
					} elseif( isset( $htmlsingleonly[$t] ) ) {
						# Hack to force empty tag for uncloseable elements
						$brace = '/>';
					} else if( isset( $htmlsingle[$t] ) ) {
						# Hack to not close $htmlsingle tags
						$brace = NULL;
					} else if( isset( $tabletags[$t] )
					&& in_array($t ,$tagstack) ) {
						// New table tag but forgot to close the previous one
						$text .= "</$t>";
					} else {
						if ( $t == 'table' ) {
							array_push( $tablestack, $tagstack );
							$tagstack = array();
						}
						array_push( $tagstack, $t );
					}

					# Replace any variables or template parameters with
					# plaintext results.
					# ($params is passed by reference so the callback can rewrite it)
					if( is_callable( $processCallback ) ) {
						call_user_func_array( $processCallback, array( &$params, $args ) );
					}

					# Strip non-approved attributes from the tag
					$newparams = Sanitizer::fixTagAttributes( $params, $t );
				}
				if ( ! $badtag ) {
					$rest = str_replace( '>', '&gt;', $rest );
					$close = ( $brace == '/>' && !$slash ) ? ' /' : '';
					$text .= "<$slash$t$newparams$close>$rest";
					continue;
				}
			}
			# Not an allowed tag, or a bad one: emit the piece as escaped text
			$text .= '&lt;' . str_replace( '>', '&gt;', $x);
		}
		# Close off any remaining tags
		while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
			$text .= "</$t>\n";
			if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
		}
	} else {
		# this might be possible using tidy itself
		foreach ( $bits as $x ) {
			preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
			$x, $regs );
			# '@' silences the notices list() raises when the match failed
			@list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
			if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
				if( is_callable( $processCallback ) ) {
					call_user_func_array( $processCallback, array( &$params, $args ) );
				}
				$newparams = Sanitizer::fixTagAttributes( $params, $t );
				$rest = str_replace( '>', '&gt;', $rest );
				$text .= "<$slash$t$newparams$brace$rest";
			} else {
				$text .= '&lt;' . str_replace( '>', '&gt;', $x);
			}
		}
	}
	wfProfileOut( __METHOD__ );
	return $text;
}
525
/**
 * Strip every '<!-- ... -->' comment from the text.
 *
 * When a comment sits alone on its line (only spaces between it and the
 * newlines on either side), the surrounding spaces and one of the two
 * newlines are removed as well, so no blank line is left behind.
 * An unterminated comment stops processing; the rest is left untouched.
 *
 * @private
 * @param $text String
 * @return string
 */
static function removeHTMLcomments( $text ) {
	wfProfileIn( __METHOD__ );
	while ( true ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			break;
		}
		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; bail out
			break;
		}
		# Point just past the closing '-->'
		$close += 3;

		# Widen the span over the spaces on both sides of the comment,
		# starting from the character just before it.
		$from = max( $open - 1, 0 );
		$span = $close - $from;
		while ( $from > 0 && substr( $text, $from, 1 ) === ' ' ) {
			$from--;
			$span++;
		}
		while ( substr( $text, $from + $span, 1 ) === ' ' ) {
			$span++;
		}

		$newlineBefore = substr( $text, $from, 1 ) === "\n";
		$newlineAfter = substr( $text, $from + $span, 1 ) === "\n";
		if ( $newlineBefore && $newlineAfter ) {
			# Drop the comment together with the spaces and both newlines,
			# putting a single newline back.
			$text = substr_replace( $text, "\n", $from, $span + 1 );
		} else {
			# Drop just the comment.
			$text = substr_replace( $text, '', $open, $close - $open );
		}
	}
	wfProfileOut( __METHOD__ );
	return $text;
}
570
/**
 * Filter a decoded attribute array for a given element.
 *
 * Looks up the attribute whitelist for the element and hands both to
 * validateAttributes(), which
 * - discards attributes that are not whitelisted
 * - discards unsafe style attributes
 * - re-encodes id attributes
 *
 * @param $attribs Array
 * @param $element String
 * @return Array
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
static function validateTagAttributes( $attribs, $element ) {
	$allowed = self::attributeWhitelist( $element );
	return self::validateAttributes( $attribs, $allowed );
}
590
/**
 * Filter a decoded attribute array against a whitelist.
 *
 * - Attributes whose name is not in the whitelist are dropped
 * - A style attribute is passed through checkCss() and dropped if rejected
 * - An id attribute is rewritten by escapeId()
 * - RDFa attributes are dropped if their value contains "javascript:"
 *
 * @param $attribs Array
 * @param $whitelist Array: list of allowed attribute names
 * @return Array
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
static function validateAttributes( $attribs, $whitelist ) {
	global $wgEnforceHtmlIds;

	$allowed = array_flip( $whitelist );
	// RDFa properties allow URIs, so their values get an extra check
	$rdfaAttributes = array( 'rel', 'rev', 'about', 'property', 'resource',
		'datatype', 'typeof' );

	$clean = array();
	foreach ( $attribs as $name => $value ) {
		if ( !isset( $allowed[$name] ) ) {
			continue;
		}

		# Strip javascript "expression" from stylesheets.
		# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
		if ( $name == 'style' ) {
			$value = self::checkCss( $value );
			if ( $value === false ) {
				# Rejected outright
				continue;
			}
		}

		if ( $name === 'id' ) {
			$mode = $wgEnforceHtmlIds ? 'noninitial' : 'xml';
			$value = self::escapeId( $value, $mode );
		}

		// Paranoia. Allow "simple" values but suppress javascript
		if ( in_array( $name, $rdfaAttributes, true )
			&& preg_match( '/(^|\s)javascript\s*:/i', $value ) ) {
			continue;
		}

		// A later occurrence of the same name replaces an earlier one,
		// so the output has at most one attribute of each name.
		$clean[$name] = $value;
	}
	return $clean;
}
645
/**
 * Merge two sets of HTML attributes. On conflict the value from the second
 * set wins, except for 'class': when both sets carry a string class and the
 * two differ, the class names are combined with duplicates removed.
 *
 * @todo implement merging for other attributes such as style
 * @param $a Array
 * @param $b Array
 * @return array
 */
static function mergeAttributes( $a, $b ) {
	$merged = array_merge( $a, $b );

	if ( !isset( $a['class'] ) || !isset( $b['class'] ) ) {
		return $merged;
	}

	$first = $a['class'];
	$second = $b['class'];
	if ( !is_string( $first ) || !is_string( $second ) || $first === $second ) {
		return $merged;
	}

	// Split on any whitespace run, ignoring empty pieces
	$names = preg_split( '/\s+/', $first . ' ' . $second, -1, PREG_SPLIT_NO_EMPTY );
	$merged['class'] = implode( ' ', array_unique( $names ) );
	return $merged;
}
667
/**
 * Pick apart some CSS and check it for forbidden or unsafe structures.
 * Returns a sanitized string, or false if it was just too evil.
 *
 * The returned string is the input with character references decoded and
 * CSS comments replaced by a space. The check itself runs on a further
 * normalized copy (hex escapes decoded, backslashes removed) and rejects
 * the value if it contains "expression", "url(" or "tp://" / "tps://"
 * (which covers http, https and ftp URLs).
 *
 * @param $value String
 * @return Mixed
 */
static function checkCss( $value ) {
	$stripped = Sanitizer::decodeCharReferences( $value );

	// Remove any comments; IE gets token splitting wrong
	$stripped = StringUtils::delimiterReplace( '/*', '*/', ' ', $stripped );

	$value = $stripped;

	// ... and continue checks
	// Decode CSS hex escapes (backslash, 1-6 hex digits, optional single
	// whitespace). A callback is used instead of the /e modifier: /e was
	// deprecated in PHP 5.5 and removed in PHP 7.0, where the replacement
	// fails and the checks below would run on an empty value.
	$stripped = preg_replace_callback(
		'!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
		array( 'Sanitizer', 'decodeCssHexEscapeCallback' ),
		$stripped );
	// Any remaining backslash escapes a literal character; drop the backslash
	$stripped = str_replace( '\\', '', $stripped );
	if( preg_match( '/(?:expression|tps*:\/\/|url\\s*\().*/is',
			$stripped ) ) {
		# Forbidden construct found
		return false;
	}

	return $value;
}

/**
 * Regex replace callback for checkCss(): turn one CSS hex escape into the
 * UTF-8 character it denotes.
 * @param $matches Array: $matches[1] holds the hex digits of the escape
 * @return String
 */
private static function decodeCssHexEscapeCallback( $matches ) {
	return codepointToUtf8( hexdec( $matches[1] ) );
}
697
/**
 * Turn a tag-soup attribute fragment into well-formed XML attributes,
 * keeping only what is allowed for the given element. The result is safe
 * for further wikitext processing.
 *
 * - Attribute names are lowercased
 * - Attributes not whitelisted for the element are dropped
 * - Broken or invalid entities become plaintext
 * - Every value is double-quoted
 * - An attribute without a value gets its own name as value
 * - Of repeated attributes only one is kept
 * - Unsafe style attributes are dropped
 * - A non-empty result starts with a space
 *
 * @param $text String
 * @param $element String
 * @return String
 */
static function fixTagAttributes( $text, $element ) {
	if ( trim( $text ) == '' ) {
		return '';
	}

	$decoded = self::decodeTagAttributes( $text );
	$allowed = self::validateTagAttributes( $decoded, $element );

	// Each pair is emitted with a leading space, which both separates the
	// pairs and separates the first one from the tag name.
	$rendered = '';
	foreach ( $allowed as $name => $value ) {
		$encName = htmlspecialchars( $name );
		$encValue = self::safeEncodeAttribute( $value );
		$rendered .= " $encName=\"$encValue\"";
	}
	return $rendered;
}
734
/**
 * Encode an attribute value for HTML output.
 *
 * Besides the usual HTML escaping (both quote styles included), newline,
 * carriage return and tab are written as numeric references. Attribute
 * decoding normalizes whitespace, so these characters would otherwise not
 * survive a round trip.
 *
 * @param $text String
 * @return HTML-encoded text fragment
 */
static function encodeAttribute( $text ) {
	$escaped = htmlspecialchars( $text, ENT_QUOTES );

	return str_replace(
		array( "\n", "\r", "\t" ),
		array( '&#10;', '&#13;', '&#9;' ),
		$escaped );
}
754
/**
 * Encode an attribute value for HTML tags, with extra armoring
 * against further wiki processing.
 *
 * On top of encodeAttribute(), character sequences that later parsing
 * stages would treat as markup (templates, links, quotes, magic words,
 * table pipes, URL protocols) are rewritten with numeric references.
 *
 * @param $text String
 * @return HTML-encoded text fragment
 */
static function safeEncodeAttribute( $text ) {
	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$armor = array(
		'<' => '&lt;', // This should never happen,
		'>' => '&gt;', // we've received invalid input
		'"' => '&quot;', // which should have been escaped.
		'{' => '&#123;',
		'[' => '&#91;',
		"''" => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC' => '&#82;FC',
		'PMID' => '&#80;MID',
		'|' => '&#124;',
		'__' => '&#95;_',
	);
	$armored = strtr( self::encodeAttribute( $text ), $armor );

	# Also break up anything that looks like a URL protocol by encoding
	# its colon (see armorLinksCallback).
	$protocolPattern = '/(' . wfUrlProtocols() . ')/';
	return preg_replace_callback(
		$protocolPattern,
		array( 'Sanitizer', 'armorLinksCallback' ),
		$armored );
}
787
/**
 * Escape a value so that it can be used in an id attribute. The value is
 * not validated beyond that (see first link).
 *
 * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
 *                                                          in the id and
 *                                                          name attributes
 * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
 *
 * @param $id String: id to escape
 * @param $options Mixed: string or array of strings (default is array()):
 *   'noninitial': This is a non-initial fragment of an id, not a full id,
 *       so the first character is not required to be valid at the
 *       beginning of an id.
 *   'xml': Don't restrict the id to be HTML4-compatible. Any character
 *       allowed in an XML name is kept, and each run of other characters
 *       (mostly whitespace) or underscores is collapsed into a single
 *       underscore, instead of the HTML4-style dot-encoding.
 * @return String
 */
static function escapeId( $id, $options = array() ) {
	$options = (array)$options;
	$isFragment = in_array( 'noninitial', $options );

	if ( in_array( 'xml', $options ) ) {
		# XML-style escaping. For the patterns used, see the XML 1.0 standard,
		# 5th edition, NameStartChar and NameChar: <http://www.w3.org/TR/REC-xml/>
		$nameStartChar = ':a-zA-Z_\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}'
			. '\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}'
			. '\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}';
		$nameChar = $nameStartChar . '.\-0-9\xB7\x{0300}-\x{036F}'
			. '\x{203F}-\x{2040}';

		# Replace _ as well so we don't get multiple consecutive underscores
		$escaped = preg_replace( "/([^$nameChar]|_)+/u", '_', $id );
		$escaped = trim( $escaped, '_' );

		if ( !$isFragment && !preg_match( "/^[$nameStartChar]/u", $escaped ) ) {
			$escaped = "_$escaped";
		}
		return $escaped;
	}

	# HTML4-style escaping: spaces become underscores, everything is
	# URL-encoded, then '%3A' is turned back into ':' and any other '%'
	# into '.'.
	$escaped = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
	$escaped = str_replace( array( '%3A', '%' ), array( ':', '.' ), $escaped );

	if ( !$isFragment && !preg_match( '/^[a-zA-Z]/', $escaped ) ) {
		// Initial character must be a letter!
		$escaped = "x$escaped";
	}
	return $escaped;
}
848
/**
 * Escape a value so that it can be used as a CSS class name.
 *
 * A leading digit or hyphen, control characters, spaces, most ASCII
 * punctuation and the non-breaking space are replaced by underscores;
 * runs of underscores are then collapsed and trailing ones removed.
 *
 * @todo For extra validity, input should be validated UTF-8.
 *
 * @see http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
 *
 * @param $class String
 * @return String
 */
static function escapeClass( $class ) {
	// Convert ugly stuff to underscores ...
	$escaped = preg_replace(
		'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
		'_',
		$class );
	// ... and kill underscores in ugly places
	$escaped = preg_replace( '/_+/', '_', $escaped );
	return rtrim( $escaped, '_' );
}
867
/**
 * Given HTML input, escape with htmlspecialchars but un-escape entities.
 * This allows (generally harmless) entities like &nbsp; to survive.
 *
 * Ampersands are first escaped with everything else, then restored, and the
 * result is run through normalizeCharReferences() so that only valid
 * references stay unescaped.
 *
 * @param $html String to escape
 * @return String: escaped input
 */
static function escapeHtmlAllowEntities( $html ) {
	# It seems wise to escape ' as well as ", as a matter of course. Can't
	# hurt.
	$escaped = htmlspecialchars( $html, ENT_QUOTES );
	$ampersandsRestored = str_replace( '&amp;', '&', $escaped );
	return self::normalizeCharReferences( $ampersandsRestored );
}
883
/**
 * Regex replace callback for armoring links against further processing:
 * every colon in the first captured group is written as '&#58;'.
 * @param $matches Array
 * @return string
 */
private static function armorLinksCallback( $matches ) {
	$protocol = $matches[1];
	return strtr( $protocol, array( ':' => '&#58;' ) );
}
892
/**
 * Return an associative array of attribute names and values from
 * a partial tag string. Attribute names are forced to lowercase,
 * whitespace in values is collapsed and trimmed, and character
 * references are decoded to UTF-8 text.
 *
 * When a name occurs more than once, the last occurrence wins.
 *
 * @param $text String
 * @return Array
 */
public static function decodeTagAttributes( $text ) {
	if ( trim( $text ) == '' ) {
		return array();
	}

	$matches = array();
	$found = preg_match_all( MW_ATTRIBS_REGEX, $text, $matches, PREG_SET_ORDER );
	if ( !$found ) {
		return array();
	}

	$decoded = array();
	foreach ( $matches as $match ) {
		$name = strtolower( $match[1] );
		$raw = self::getTagAttributeCallback( $match );

		// Normalize whitespace
		$collapsed = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

		// Decode character references
		$decoded[$name] = self::decodeCharReferences( $collapsed );
	}
	return $decoded;
}
930
/**
 * Pick the appropriate attribute value from a match set from the
 * MW_ATTRIBS_REGEX matches.
 *
 * Trailing groups that did not take part in the match are absent from the
 * set, so the highest group index present identifies the form the value
 * was written in.
 *
 * @param $set Array
 * @return String
 */
private static function getTagAttributeCallback( $set ) {
	# 6: illegal #XXXXXX color with no quotes, 5: no quotes,
	# 4: single-quoted, 3: double-quoted
	foreach ( array( 6, 5, 4, 3 ) as $group ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( isset( $set[2] ) ) {
		# An "=" part was matched but none of the value groups was
		throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
	}

	# In XHTML, attributes must have a value.
	# For 'reduced' form, return explicitly the attribute name here.
	return $set[1];
}
959
/**
 * Normalize whitespace and character references in an XML source-
 * encoded text for an attribute value, and encode double quotes.
 *
 * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
 * but note that the result is not the attribute's value: it is an
 * XML source fragment meant to be placed into output as is.
 *
 * @param $text String
 * @return String
 */
private static function normalizeAttributeValue( $text ) {
	$normalized = self::normalizeCharReferences( $text );
	$flattened = self::normalizeWhitespace( $normalized );
	return str_replace( '"', '&quot;', $flattened );
}
976
/**
 * Replace each CR LF pair, and each remaining single space, CR, LF or tab,
 * with one space. Runs are not collapsed.
 *
 * @param $text String
 * @return String
 */
private static function normalizeWhitespace( $text ) {
	$whitespace = '/\r\n|[\x20\x0d\x0a\x09]/';
	return preg_replace( $whitespace, ' ', $text );
}
983
/**
 * Ensure that any entities and character references are legal
 * for XML and XHTML specifically. Any stray bits will be
 * &amp;-escaped to result in a valid text fragment.
 *
 * a. any named char refs must be known in XHTML
 * b. any numeric char refs must be legal chars, not invalid or forbidden
 * c. use &#x, not &#X
 *
 * The work for each reference is done by normalizeCharReferencesCallback().
 *
 * @param $text String
 * @return String
 * @private
 */
static function normalizeCharReferences( $text ) {
	$callback = array( 'Sanitizer', 'normalizeCharReferencesCallback' );
	return preg_replace_callback( MW_CHAR_REFS_REGEX, $callback, $text );
}
/**
 * Regex replace callback for normalizeCharReferences().
 *
 * Dispatches on the first non-empty capture group of MW_CHAR_REFS_REGEX:
 * 1 is a named entity, 2 a decimal reference, 3 and 4 hexadecimal
 * references. If no group applies, or the handler rejects the reference,
 * the matched text is returned HTML-escaped.
 *
 * @param $matches Array
 * @return String
 */
static function normalizeCharReferencesCallback( $matches ) {
	$handlers = array(
		1 => 'normalizeEntity',
		2 => 'decCharReference',
		3 => 'hexCharReference',
		4 => 'hexCharReference',
	);

	$normalized = null;
	foreach ( $handlers as $group => $method ) {
		if ( $matches[$group] != '' ) {
			$normalized = call_user_func( array( 'Sanitizer', $method ), $matches[$group] );
			break;
		}
	}

	if ( $normalized === null ) {
		return htmlspecialchars( $matches[0] );
	}
	return $normalized;
}
1025
/**
 * Normalize a named entity reference.
 *
 * - A MediaWiki-specific alias is replaced by the HTML entity it maps to.
 * - A name listed in $wgHtmlEntities is returned as a reference unchanged.
 * - Anything else is returned as HTML-escaped pseudo-entity source
 *   (eg &amp;foo;)
 *
 * @param $name String
 * @return String
 */
static function normalizeEntity( $name ) {
	global $wgHtmlEntities, $wgHtmlEntityAliases;

	if ( isset( $wgHtmlEntityAliases[$name] ) ) {
		return '&' . $wgHtmlEntityAliases[$name] . ';';
	}
	if ( isset( $wgHtmlEntities[$name] ) ) {
		return '&' . $name . ';';
	}
	return '&amp;' . $name . ';';
}
1045
/**
 * Normalize a decimal character reference.
 *
 * @param $codepoint String: the digits of the reference
 * @return String|null: '&#N;' for a code point accepted by
 *   validateCodepoint(), null otherwise
 */
static function decCharReference( $codepoint ) {
	$point = intval( $codepoint );
	if ( !self::validateCodepoint( $point ) ) {
		return null;
	}
	return sprintf( '&#%d;', $point );
}
1054
/**
 * Normalize a hexadecimal character reference.
 *
 * @param $codepoint String: the hex digits of the reference
 * @return String|null: '&#xN;' with lowercase marker and digits for a code
 *   point accepted by validateCodepoint(), null otherwise
 */
static function hexCharReference( $codepoint ) {
	$point = hexdec( $codepoint );
	if ( !self::validateCodepoint( $point ) ) {
		return null;
	}
	return sprintf( '&#x%x;', $point );
}
1063
/**
 * Returns true if a given Unicode codepoint is a valid character in XML.
 *
 * Accepted are tab, line feed, carriage return, and the ranges
 * U+0020..U+D7FF, U+E000..U+FFFD and U+10000..U+10FFFF.
 *
 * @param $codepoint Integer
 * @return Boolean
 */
private static function validateCodepoint( $codepoint ) {
	if ( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
		return true;
	}
	if ( $codepoint >= 0x20 && $codepoint <= 0xd7ff ) {
		return true;
	}
	if ( $codepoint >= 0xe000 && $codepoint <= 0xfffd ) {
		return true;
	}
	return $codepoint >= 0x10000 && $codepoint <= 0x10ffff;
}
1077
1078 /**
1079 * Decode any character references, numeric or named entities,
1080 * in the text and return a UTF-8 string.
1081 *
1082 * @param $text String
1083 * @return String
1084 */
public static function decodeCharReferences( $text ) {
	# Each match of MW_CHAR_REFS_REGEX is handed to the callback, whose
	# return value replaces the matched text.
	$decoder = array( 'Sanitizer', 'decodeCharReferencesCallback' );
	return preg_replace_callback( MW_CHAR_REFS_REGEX, $decoder, $text );
}
1091
1092 /**
 * @param $matches Array: capture groups of one MW_CHAR_REFS_REGEX match
1094 * @return String
1095 */
static function decodeCharReferencesCallback( $matches ) {
	# Group 1: named entity
	if( $matches[1] != '' ) {
		return Sanitizer::decodeEntity( $matches[1] );
	}
	# Group 2: decimal reference
	if( $matches[2] != '' ) {
		return Sanitizer::decodeChar( intval( $matches[2] ) );
	}
	# Groups 3 and 4: hex reference written with "x" and "X" respectively
	foreach( array( 3, 4 ) as $group ) {
		if( $matches[$group] != '' ) {
			return Sanitizer::decodeChar( hexdec( $matches[$group] ) );
		}
	}
	# Last case should be an ampersand by itself
	return $matches[0];
}
1109
1110 /**
1111 * Return UTF-8 string for a codepoint if that is a valid
1112 * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
1113 * @param $codepoint Integer
1114 * @return String
1115 * @private
1116 */
static function decodeChar( $codepoint ) {
	# Code points that fail validation are replaced, not dropped
	return Sanitizer::validateCodepoint( $codepoint )
		? codepointToUtf8( $codepoint )
		: UTF8_REPLACEMENT;
}
1124
1125 /**
1126 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
1127 * return the UTF-8 encoding of that character. Otherwise, returns
1128 * pseudo-entity source (eg &foo;)
1129 *
 * @param $name String
1131 * @return String
1132 */
static function decodeEntity( $name ) {
	global $wgHtmlEntities, $wgHtmlEntityAliases;

	# A MediaWiki-specific alias is first swapped for the name it maps to
	$resolved = isset( $wgHtmlEntityAliases[$name] )
		? $wgHtmlEntityAliases[$name]
		: $name;

	if( !isset( $wgHtmlEntities[$resolved] ) ) {
		# Not a known entity: give back the pseudo-entity source
		return "&$resolved;";
	}
	return codepointToUtf8( $wgHtmlEntities[$resolved] );
}
1144
1145 /**
1146 * Fetch the whitelist of acceptable attributes for a given element name.
1147 *
1148 * @param $element String
1149 * @return Array
1150 */
static function attributeWhitelist( $element ) {
	# Built on the first call and kept in a function-static variable
	static $whitelist = null;
	if( $whitelist === null ) {
		$whitelist = Sanitizer::setupAttributeWhitelist();
	}

	# Elements without an entry get an empty attribute list
	if( !isset( $whitelist[$element] ) ) {
		return array();
	}
	return $whitelist[$element];
}
1160
1161 /**
 * Build the attribute whitelist: an array keyed by allowed HTML element
 * name, where each value is the array of attributes allowed on that element
1164 * @return Array
1165 */
static function setupAttributeWhitelist() {
	# Attributes accepted on almost every whitelisted element
	$core = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
	# RDFa attributes as specified in section 9 of
	# http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
	$rdfa = array( 'about', 'property', 'resource', 'datatype', 'typeof' );
	$general = array_merge( $core, $rdfa );

	# Building blocks reused by several entries below
	$alignable = array_merge( $general, array( 'align' ) );
	$cellAlign = array( 'align', 'char', 'charoff', 'valign' );
	$cellOnly = array(
		'abbr', 'axis', 'headers', 'scope', 'rowspan', 'colspan',
		# the remaining four are deprecated
		'nowrap', 'width', 'height', 'bgcolor',
	);
	$edited = array_merge( $general, array( 'cite', 'datetime' ) );
	$rowGroup = array_merge( $general, $cellAlign );
	$column = array_merge( $general, array( 'span', 'width' ), $cellAlign );
	$cell = array_merge( $general, $cellOnly, $cellAlign );

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: http://www.w3.org/TR/html4/
	return array(
		# 7.5.4
		'div'        => $alignable,
		'center'     => $general, # deprecated
		'span'       => $alignable, # ??

		# 7.5.5
		'h1'         => $alignable,
		'h2'         => $alignable,
		'h3'         => $alignable,
		'h4'         => $alignable,
		'h5'         => $alignable,
		'h6'         => $alignable,

		# 7.5.6 (address) and 8.2.4 (bdo) are not whitelisted

		# 9.2.1 (dfn, samp, kbd and acronym are not whitelisted)
		'em'         => $general,
		'strong'     => $general,
		'cite'       => $general,
		'code'       => $general,
		'var'        => $general,
		'abbr'       => $general,

		# 9.2.2 (q is not whitelisted)
		'blockquote' => array_merge( $general, array( 'cite' ) ),

		# 9.2.3
		'sub'        => $general,
		'sup'        => $general,

		# 9.3.1
		'p'          => $alignable,

		# 9.3.2
		'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),

		# 9.3.4
		'pre'        => array_merge( $general, array( 'width' ) ),

		# 9.4
		'ins'        => $edited,
		'del'        => $edited,

		# 10.2
		'ul'         => array_merge( $general, array( 'type' ) ),
		'ol'         => array_merge( $general, array( 'type', 'start' ) ),
		'li'         => array_merge( $general, array( 'type', 'value' ) ),

		# 10.3
		'dl'         => $general,
		'dd'         => $general,
		'dt'         => $general,

		# 11.2.1
		'table'      => array_merge( $general, array(
			'summary', 'width', 'border', 'frame',
			'rules', 'cellspacing', 'cellpadding',
			'align', 'bgcolor',
		) ),

		# 11.2.2
		'caption'    => $alignable,

		# 11.2.3
		'thead'      => $rowGroup,
		'tfoot'      => $rowGroup,
		'tbody'      => $rowGroup,

		# 11.2.4
		'colgroup'   => $column,
		'col'        => $column,

		# 11.2.5
		'tr'         => array_merge( $general, array( 'bgcolor' ), $cellAlign ),

		# 11.2.6
		'td'         => $cell,
		'th'         => $cell,

		# 13.2
		# Not usually allowed, but may be used for extension-style hooks
		# such as <math> when it is rasterized
		'img'        => array_merge( $general, array( 'alt' ) ),

		# 15.2.1
		'tt'         => $general,
		'b'          => $general,
		'i'          => $general,
		'big'        => $general,
		'small'      => $general,
		'strike'     => $general,
		's'          => $general,
		'u'          => $general,

		# 15.2.2 (basefont is not whitelisted)
		'font'       => array_merge( $general, array( 'size', 'color', 'face' ) ),

		# 15.3
		'hr'         => array_merge( $general, array( 'noshade', 'size', 'width' ) ),

		# XHTML Ruby annotation text module, simple ruby only.
		# http://www.w3c.org/TR/ruby/
		# (rbc and rtc are not whitelisted; 'rbspan' is not enabled on rt)
		'ruby'       => $general,
		'rb'         => $general,
		'rt'         => $general,
		'rp'         => $general,

		# MathML root element, where used for extensions
		# 'title' may not be 100% valid here; it's XHTML
		# http://www.w3.org/TR/REC-MathML/
		'math'       => array( 'class', 'style', 'id', 'title' ),
	);
}
1315
1316 /**
1317 * Take a fragment of (potentially invalid) HTML and return
1318 * a version with any tags removed, encoded as plain text.
1319 *
1320 * Warning: this return value must be further escaped for literal
1321 * inclusion in HTML output as of 1.10!
1322 *
1323 * @param $text String: HTML fragment
1324 * @return String
1325 */
static function stripAllTags( $text ) {
	# Remove actual <tags> first
	$untagged = StringUtils::delimiterReplace( '<', '>', '', $text );

	# Then normalize &entities, and finally whitespace
	return self::normalizeWhitespace( self::decodeCharReferences( $untagged ) );
}
1336
1337 /**
1338 * Hack up a private DOCTYPE with HTML's standard entity declarations.
1339 * PHP 4 seemed to know these if you gave it an HTML doctype, but
1340 * PHP 5.1 doesn't.
1341 *
1342 * Use for passing XHTML fragments to PHP's XML parsing functions
1343 *
1344 * @return String
1345 */
static function hackDocType() {
	global $wgHtmlEntities;

	# One <!ENTITY> declaration per known entity, mapping its name to the
	# matching decimal character reference
	$declarations = array();
	foreach( $wgHtmlEntities as $entity => $codepoint ) {
		$declarations[] = "<!ENTITY $entity \"&#$codepoint;\">";
	}

	# The declarations are joined without separators inside the
	# internal subset of the DOCTYPE
	return "<!DOCTYPE html [\n" . implode( '', $declarations ) . "]>\n";
}
1355
/**
 * Clean up a URL: decode character references, percent-encode square
 * and angle brackets, double quotes, bytes 0x00-0x20 and 0x7F, and
 * strip characters that are ignored in IDNs from the host portion.
 *
 * @param $url String
 * @return String
 */
static function cleanUrl( $url ) {
	# Normalize any HTML entities in input. They will be
	# re-escaped by makeExternalLink().
	$url = Sanitizer::decodeCharReferences( $url );

	# Escape any control characters introduced by the above step.
	# A callback is used instead of the /e modifier: /e is deprecated
	# since PHP 5.5 and removed in PHP 7, and it addslashes()-escaped
	# the match before evaluating it, so '"' came out as %5C%22 and a
	# NUL byte as %5C0 instead of %22 and %00.
	$url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F]/',
		array( __CLASS__, 'cleanUrlCallback' ), $url );

	# Validate hostname portion
	$matches = array();
	if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
		list( /* $whole */, $protocol, $host, $rest ) = $matches;

		// Characters that will be ignored in IDNs.
		// http://tools.ietf.org/html/3454#section-3.1
		// Strip them before further processing so blacklists and such work.
		// (The last alternative covers U+FE00 through U+FE0F.)
		$strip = "/
			\\s|          # general whitespace
			\xc2\xad|     # 00ad SOFT HYPHEN
			\xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
			\xe2\x80\x8b| # 200b ZERO WIDTH SPACE
			\xe2\x81\xa0| # 2060 WORD JOINER
			\xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
			\xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
			\xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
			\xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
			\xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
			\xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
			\xe2\x80\x8d| # 200d ZERO WIDTH JOINER
			[\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
			/xuD";

		# NOTE(review): with the /u modifier preg_replace() returns null
		# for a host that is not valid UTF-8, which drops the host from
		# the result -- confirm whether that is intended.
		$host = preg_replace( $strip, '', $host );

		// @fixme: validate hostnames here

		return $protocol . $host . $rest;
	} else {
		return $url;
	}
}

/**
 * preg_replace_callback() helper for cleanUrl(): percent-encodes the
 * matched text with urlencode().
 *
 * @param $matches Array: $matches[0] is the matched character
 * @return String
 */
private static function cleanUrlCallback( $matches ) {
	return urlencode( $matches[0] );
}
1397
1398 }