This Sanitizer::EVIL_URI_PATTERN is completely inadequate for actual security as...

[lhc/web/wiklou.git] / includes / Sanitizer.php
diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php

index 401d50f..118f170 100644 (file)
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -39,6 +39,14 @@ class Sanitizer {
                  |&\#[xX]([0-9A-Fa-f]+);
                  |(&)/x';
  
+       /**
+        * Blacklist for evil uris like javascript:
+        * WARNING: DO NOT use this in any place that actually requires blacklisting
+        * for security reasons. There are NUMEROUS[1] ways to bypass blacklisting; the
+        * only way to be secure from javascript: URI-based XSS vectors is to whitelist
+        * things that you know are safe and deny everything else.
+        * [1]: http://ha.ckers.org/xss.html
+        */
         const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
         const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
  
@@ -64,7 +72,7 @@ class Sanitizer {
                 'amp'      => 38,
                 'and'      => 8743,
                 'ang'      => 8736,
-               'apos'     => 39,
+               'apos'     => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
                 'Aring'    => 197,
                 'aring'    => 229,
                 'asymp'    => 8776,
@@ -792,6 +800,10 @@ class Sanitizer {
                 return $value;
         }
  
+       /**
+        * @param $matches array
+        * @return String
+        */
         static function cssDecodeCallback( $matches ) {
                 if ( $matches[1] !== '' ) {
                         // Line continuation
@@ -1093,6 +1105,10 @@ class Sanitizer {
                                 Sanitizer::normalizeCharReferences( $text ) ) );
         }
  
+       /**
+        * @param $text string
+        * @return mixed
+        */
         private static function normalizeWhitespace( $text ) {
                 return preg_replace(
                         '/\r\n|[\x20\x0d\x0a\x09]/',
@@ -1176,6 +1192,10 @@ class Sanitizer {
                 }
         }
  
+       /**
+        * @param $codepoint
+        * @return null|string
+        */
         static function decCharReference( $codepoint ) {
                 $point = intval( $codepoint );
                 if( Sanitizer::validateCodepoint( $point ) ) {
@@ -1185,6 +1205,10 @@ class Sanitizer {
                 }
         }
  
+       /**
+        * @param $codepoint
+        * @return null|string
+        */
         static function hexCharReference( $codepoint ) {
                 $point = hexdec( $codepoint );
                 if( Sanitizer::validateCodepoint( $point ) ) {
@@ -1282,7 +1306,7 @@ class Sanitizer {
          * return the UTF-8 encoding of that character. Otherwise, returns
          * pseudo-entity source (eg &foo;)
          *
-        * @param $name Strings
+        * @param $name String
          * @return String
          */
         static function decodeEntity( $name ) {
@@ -1523,6 +1547,10 @@ class Sanitizer {
                 return $out;
         }
  
+       /**
+        * @param $url string
+        * @return mixed|string
+        */
         static function cleanUrl( $url ) {
                 # Normalize any HTML entities in input. They will be
                 # re-escaped by makeExternalLink().
@@ -1558,7 +1586,7 @@ class Sanitizer {
  
                         $host = preg_replace( $strip, '', $host );
  
-                       // @todo Fixme: validate hostnames here
+                       // @todo FIXME: Validate hostnames here
  
                         return $protocol . $host . $rest;
                 } else {
@@ -1566,7 +1594,63 @@ class Sanitizer {
                 }
         }
  
+       /**
+        * @param $matches array Only element 0 is read (presumably the full regex match; confirm at the call site)
+        * @return string Element 0 of $matches passed through urlencode()
+        */
         static function cleanUrlCallback( $matches ) {
                 return urlencode( $matches[0] );
         }
+
+       /**
+        * Does a string look like an e-mail address?
+        *
+        * This validates an email address using an HTML5 specification found at:
+        * http://www.whatwg.org/specs/web-apps/current-work/multipage/states-of-the-type-attribute.html#valid-e-mail-address
+        * Which as of 2011-01-24 says:
+        *
+        *   A valid e-mail address is a string that matches the ABNF production
+        *   1*( atext / "." ) "@" ldh-str *( "." ldh-str ) where atext is defined
+        *   in RFC 5322 section 3.2.3, and ldh-str is defined in RFC 1034 section
+        *   3.5.
+        *
+        * This function is an implementation of the specification as requested in
+        * bug 22449.
+        *
+        * Client-side forms will use the same standard validation rules via JS or
+        * HTML 5 validation; additional restrictions can be enforced server-side
+        * by extensions via the 'isValidEmailAddr' hook.
+        *
+        * Note that this validation doesn't 100% match RFC 2822, but is believed
+        * to be liberal enough for wide use. Some invalid addresses will still
+        * pass validation here.
+        *
+        * @since 1.18
+        *
+        * @param $addr String E-mail address
+        * @return Bool Whether $addr matches; if wfRunHooks() returns false, $result is returned as-is instead
+        */
+       public static function validateEmail( $addr ) {
+               $result = null; // Passed by reference to the 'isValidEmailAddr' hook below
+               if( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
+                       return $result;
+               }
+
+               // Please note the strings below are used inside bracket expressions [],
+               // where a bare hyphen "-" is a range indicator. Hence it is backslash-escaped.
+               // See bug 26948
+               $rfc5322_atext   = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~" ;
+               $rfc1034_ldh_str = "a-z0-9\\-" ;
+
+               $HTML5_email_regexp = "/
+               ^                      # start of string
+               [$rfc5322_atext\\.]+    # user part which is liberal :p
+               @                      # at sign
+               [$rfc1034_ldh_str]+       # First domain part
+               (\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
+               $                      # End of string
+               /ixD" ; // case Insensitive, eXtended, Dollar-end-only (reject a trailing newline)
+
+               return (bool) preg_match( $HTML5_email_regexp, $addr );
+       }
  }