(bug 12400) Add class to content transcluded from IW source.
[lhc/web/wiklou.git] / includes / parser / Parser.php
index a6d9f93..2f4d702 100644 (file)
@@ -92,7 +92,7 @@ class Parser
        # Persistent:
        var $mTagHooks, $mTransparentTagHooks, $mFunctionHooks, $mFunctionSynonyms, $mVariables,
                $mImageParams, $mImageParamsMagicArray, $mStripList, $mMarkerIndex, $mPreprocessor,
-               $mExtLinkBracketedRegex, $mDefaultStripList, $mVarCache, $mConf;
+               $mExtLinkBracketedRegex, $mUrlProtocols, $mDefaultStripList, $mVarCache, $mConf;
 
 
        # Cleared with clearState():
@@ -128,11 +128,16 @@ class Parser
                $this->mFunctionHooks = array();
                $this->mFunctionSynonyms = array( 0 => array(), 1 => array() );
                $this->mDefaultStripList = $this->mStripList = array( 'nowiki', 'gallery' );
+               $this->mUrlProtocols = wfUrlProtocols();
                $this->mExtLinkBracketedRegex = '/\[(\b(' . wfUrlProtocols() . ')'.
                        '[^][<>"\\x00-\\x20\\x7F]+) *([^\]\\x0a\\x0d]*?)\]/S';
                $this->mVarCache = array();
                if ( isset( $conf['preprocessorClass'] ) ) {
                        $this->mPreprocessorClass = $conf['preprocessorClass'];
+               } elseif ( extension_loaded( 'domxml' ) ) {
+                       // PECL extension that conflicts with the core DOM extension (bug 13770)
+                       wfDebug( "Warning: you have the obsolete domxml extension for PHP. Please remove it!\n" );
+                       $this->mPreprocessorClass = 'Preprocessor_Hash';
                } elseif ( extension_loaded( 'dom' ) ) {
                        $this->mPreprocessorClass = 'Preprocessor_DOM';
                } else {
@@ -1010,58 +1015,120 @@ class Parser
         */
        function doMagicLinks( $text ) {
                wfProfileIn( __METHOD__ );
+               $prots = $this->mUrlProtocols; # regex fragment for the allowed URL protocols, interpolated into the pattern below
+               $urlChar = self::EXT_LINK_URL_CLASS; # matches one URL character; used as $urlChar+ in the m[3] alternative
                $text = preg_replace_callback(
                        '!(?:                           # Start cases
-                           <a.*?</a> |                 # Skip link text
-                           <.*?> |                     # Skip stuff inside HTML elements
-                           (?:RFC|PMID)\s+([0-9]+) |   # RFC or PMID, capture number as m[1]
-                           ISBN\s+(\b                  # ISBN, capture number as m[2]
-                                     (?: 97[89] [\ \-]? )?   # optional 13-digit ISBN prefix
-                                     (?: [0-9]  [\ \-]? ){9} # 9 digits with opt. delimiters
-                                     [0-9Xx]                 # check digit
-                                   \b)
+                               (<a.*?</a>) |               # m[1]: Skip link text 
+                               (<.*?>) |                   # m[2]: Skip stuff inside HTML elements' . "
+                               (\\b(?:$prots)$urlChar+) |  # m[3]: Free external links" . '
+                               (?:RFC|PMID)\s+([0-9]+) |   # m[4]: RFC or PMID, capture number
+                               ISBN\s+(\b                  # m[5]: ISBN, capture number
+                                   (?: 97[89] [\ \-]? )?   # optional 13-digit ISBN prefix
+                                   (?: [0-9]  [\ \-]? ){9} # 9 digits with opt. delimiters
+                                   [0-9Xx]                 # check digit
+                                   \b)
                        )!x', array( &$this, 'magicLinkCallback' ), $text );
                wfProfileOut( __METHOD__ );
                return $text;
        }
 
        function magicLinkCallback( $m ) {
-               if ( substr( $m[0], 0, 1 ) === '<' ) {
+               if ( isset( $m[1] ) && strval( $m[1] ) !== '' ) { # groups that took no part in the match are '' or missing, hence isset + non-empty test
+                       # m[1]: a whole <a>...</a> element was matched; return it unchanged
+                       return $m[0];
+               } elseif ( isset( $m[2] ) && strval( $m[2] ) !== '' ) {
                        # Skip HTML element
                        return $m[0];
-               } elseif ( substr( $m[0], 0, 4 ) === 'ISBN' ) {
-                       $isbn = $m[2];
-                       $num = strtr( $isbn, array(
-                               '-' => '',
-                               ' ' => '',
-                               'x' => 'X',
-                       ));
-                       $titleObj = SpecialPage::getTitleFor( 'Booksources', $num );
-                       $text = '<a href="' .
-                               $titleObj->escapeLocalUrl() .
-                               "\" class=\"internal\">ISBN $isbn</a>";
-               } else {
+               } elseif ( isset( $m[3] ) && strval( $m[3] ) !== '' ) {
+                       # m[3]: free external link; the whole match is the URL text handed to makeFreeExternalLink()
+                       return $this->makeFreeExternalLink( $m[0] );
+               } elseif ( isset( $m[4] ) && strval( $m[4] ) !== '' ) {
+                       # m[4]: the number after RFC or PMID; the keyword itself is read from the start of m[0]
                        if ( substr( $m[0], 0, 3 ) === 'RFC' ) {
                                $keyword = 'RFC';
                                $urlmsg = 'rfcurl';
-                               $id = $m[1];
+                               $id = $m[4];
                        } elseif ( substr( $m[0], 0, 4 ) === 'PMID' ) {
                                $keyword = 'PMID';
                                $urlmsg = 'pubmedurl';
-                               $id = $m[1];
+                               $id = $m[4];
                        } else {
                                throw new MWException( __METHOD__.': unrecognised match type "' .
                                        substr($m[0], 0, 20 ) . '"' );
                        }
-
                        $url = wfMsg( $urlmsg, $id);
                        $sk = $this->mOptions->getSkin();
                        $la = $sk->getExternalLinkAttributes( $url, $keyword.$id );
-                       $text = "<a href=\"{$url}\"{$la}>{$keyword} {$id}</a>";
+                       return "<a href=\"{$url}\"{$la}>{$keyword} {$id}</a>";
+               } elseif ( isset( $m[5] ) && strval( $m[5] ) !== '' ) {
+                       # m[5]: ISBN as written; spaces and hyphens are removed and x is upper-cased for the Booksources title
+                       $isbn = $m[5];
+                       $num = strtr( $isbn, array(
+                               '-' => '',
+                               ' ' => '',
+                               'x' => 'X',
+                       ));
+                       $titleObj = SpecialPage::getTitleFor( 'Booksources', $num );
+                       return'<a href="' .
+                               $titleObj->escapeLocalUrl() .
+                               "\" class=\"internal\">ISBN $isbn</a>";
+               } else {
+                       return $m[0]; # no capture group was non-empty; leave the matched text unchanged
                }
-               return $text;
        }
 
+       /**
+        * Turn a free (unbracketed) URL into HTML: an external link, or an image if maybeMakeExternalImage() returns one.
+        * @return string HTML, followed by any text split off the end of $url ("&lt;"/"&gt;" onward, trailing punctuation)
+        * @private
+        */
+       function makeFreeExternalLink( $url ) {
+               global $wgContLang;
+               wfProfileIn( __METHOD__ );
+
+               $sk = $this->mOptions->getSkin();
+               $trail = ''; # text removed from the end of $url; appended after the generated HTML
+
+               # The characters '<' and '>' (which were escaped by removeHTMLtags())
+               # should not be included in URLs, per RFC 2396, so cut $url at the
+               # first "&lt;" or "&gt;" and move the remainder into $trail.
+               $m2 = array();
+               if (preg_match('/&(lt|gt);/', $url, $m2, PREG_OFFSET_CAPTURE)) {
+                       $trail = substr($url, $m2[0][1]) . $trail;
+                       $url = substr($url, 0, $m2[0][1]);
+               }
+
+               # Move trailing punctuation to $trail
+               $sep = ',;\.:!?'; # NOTE(review): single-quoted, so "\." is a backslash plus a dot; a trailing backslash is stripped too -- confirm intended
+               # If the URL contains no left bracket, a trailing right bracket counts as punctuation too
+               if ( strpos( $url, '(' ) === false ) {
+                       $sep .= ')';
+               }
+
+               $numSepChars = strspn( strrev( $url ), $sep ); # length of the run of separator characters at the end of $url
+               if ( $numSepChars ) {
+                       $trail = substr( $url, -$numSepChars ) . $trail;
+                       $url = substr( $url, 0, -$numSepChars );
+               }
+
+               $url = Sanitizer::cleanUrl( $url );
+
+               # Is this an external image?
+               $text = $this->maybeMakeExternalImage( $url );
+               if ( $text === false ) {
+                       # Not an image, make a link
+                       $text = $sk->makeExternalLink( $url, $wgContLang->markNoConversion($url), true, 'free', $this->mTitle->getNamespace() );
+                       # Register the link in the output object, after replacing
+                       # unnecessary URL escape codes with their equivalent characters
+                       $pasteurized = self::replaceUnusualEscapes( $url );
+                       $this->mOutput->addExternalLink( $pasteurized );
+               }
+               wfProfileOut( __METHOD__ );
+               return $text . $trail;
+       }
+
+
        /**
         * Parse headers and return html
         *
@@ -1275,8 +1342,7 @@ class Parser
                $sk = $this->mOptions->getSkin();
 
                $bits = preg_split( $this->mExtLinkBracketedRegex, $text, -1, PREG_SPLIT_DELIM_CAPTURE );
-
-               $s = $this->replaceFreeExternalLinks( array_shift( $bits ) );
+               $s = array_shift( $bits );
 
                $i = 0;
                while ( $i<count( $bits ) ) {
@@ -1327,10 +1393,6 @@ class Parser
 
                        $url = Sanitizer::cleanUrl( $url );
 
-                       # Process the trail (i.e. everything after this link up until start of the next link),
-                       # replacing any non-bracketed links
-                       $trail = $this->replaceFreeExternalLinks( $trail );
-
                        # Use the encoded URL
                        # This means that users can paste URLs directly into the text
                        # Funny characters like &ouml; aren't valid in URLs anyway
@@ -1348,86 +1410,6 @@ class Parser
                return $s;
        }
 
-       /**
-        * Replace anything that looks like a URL with a link
-        * @private
-        */
-       function replaceFreeExternalLinks( $text ) {
-               global $wgContLang;
-               wfProfileIn( __METHOD__ );
-
-               $bits = preg_split( '/(\b(?:' . wfUrlProtocols() . '))/S', $text, -1, PREG_SPLIT_DELIM_CAPTURE );
-               $s = array_shift( $bits );
-               $i = 0;
-
-               $sk = $this->mOptions->getSkin();
-
-               while ( $i < count( $bits ) ){
-                       $protocol = $bits[$i++];
-                       $remainder = $bits[$i++];
-
-                       $m = array();
-                       if ( preg_match( '/^('.self::EXT_LINK_URL_CLASS.'+)(.*)$/s', $remainder, $m ) ) {
-                               # Found some characters after the protocol that look promising
-                               $url = $protocol . $m[1];
-                               $trail = $m[2];
-
-                               # special case: handle urls as url args:
-                               # http://www.example.com/foo?=http://www.example.com/bar
-                               if(strlen($trail) == 0 &&
-                                       isset($bits[$i]) &&
-                                       preg_match('/^'. wfUrlProtocols() . '$/S', $bits[$i]) &&
-                                       preg_match( '/^('.self::EXT_LINK_URL_CLASS.'+)(.*)$/s', $bits[$i + 1], $m ))
-                               {
-                                       # add protocol, arg
-                                       $url .= $bits[$i] . $m[1]; # protocol, url as arg to previous link
-                                       $i += 2;
-                                       $trail = $m[2];
-                               }
-
-                               # The characters '<' and '>' (which were escaped by
-                               # removeHTMLtags()) should not be included in
-                               # URLs, per RFC 2396.
-                               $m2 = array();
-                               if (preg_match('/&(lt|gt);/', $url, $m2, PREG_OFFSET_CAPTURE)) {
-                                       $trail = substr($url, $m2[0][1]) . $trail;
-                                       $url = substr($url, 0, $m2[0][1]);
-                               }
-
-                               # Move trailing punctuation to $trail
-                               $sep = ',;\.:!?';
-                               # If there is no left bracket, then consider right brackets fair game too
-                               if ( strpos( $url, '(' ) === false ) {
-                                       $sep .= ')';
-                               }
-
-                               $numSepChars = strspn( strrev( $url ), $sep );
-                               if ( $numSepChars ) {
-                                       $trail = substr( $url, -$numSepChars ) . $trail;
-                                       $url = substr( $url, 0, -$numSepChars );
-                               }
-
-                               $url = Sanitizer::cleanUrl( $url );
-
-                               # Is this an external image?
-                               $text = $this->maybeMakeExternalImage( $url );
-                               if ( $text === false ) {
-                                       # Not an image, make a link
-                                       $text = $sk->makeExternalLink( $url, $wgContLang->markNoConversion($url), true, 'free', $this->mTitle->getNamespace() );
-                                       # Register it in the output object...
-                                       # Replace unnecessary URL escape codes with their equivalent characters
-                                       $pasteurized = self::replaceUnusualEscapes( $url );
-                                       $this->mOutput->addExternalLink( $pasteurized );
-                               }
-                               $s .= $text . $trail;
-                       } else {
-                               $s .= $protocol . $remainder;
-                       }
-               }
-               wfProfileOut( __METHOD__ );
-               return $s;
-       }
-
        /**
         * Replace unusual URL escape codes with their equivalent characters
         * @param string
@@ -1464,7 +1446,7 @@ class Parser
 
        /**
         * make an image if it's allowed, either through the global
-        * option or through the exception
+        * option, through the exception, or through the on-wiki whitelist
         * @private
         */
        function maybeMakeExternalImage( $url ) {
@@ -1472,13 +1454,41 @@ class Parser
                $imagesfrom = $this->mOptions->getAllowExternalImagesFrom();
                $imagesexception = !empty($imagesfrom);
                $text = false;
+               # $imagesfrom could be either a single string or an array of strings, parse out the latter
+               if( $imagesexception && is_array( $imagesfrom ) ) {
+                       $imagematch = false;
+                       foreach( $imagesfrom as $match ) {
+                               if( strpos( $url, $match ) === 0 ) {
+                                       $imagematch = true;
+                                       break;
+                               }
+                       }
+               } elseif( $imagesexception ) {
+                       $imagematch = (strpos( $url, $imagesfrom ) === 0);
+               } else {
+                       $imagematch = false;
+               }
                if ( $this->mOptions->getAllowExternalImages()
-                    || ( $imagesexception && strpos( $url, $imagesfrom ) === 0 ) ) {
+                    || ( $imagesexception && $imagematch ) ) {
                        if ( preg_match( self::EXT_IMAGE_REGEX, $url ) ) {
                                # Image found
                                $text = $sk->makeExternalImage( $url );
                        }
                }
+               if( !$text && $this->mOptions->getEnableImageWhitelist()
+                        && preg_match( self::EXT_IMAGE_REGEX, $url ) ) {
+                       $whitelist = explode( "\n", wfMsgForContent( 'external_image_whitelist' ) );
+                       foreach( $whitelist as $entry ) {
+                               # Sanitize the regex fragment, make it case-insensitive, ignore blank entries/comments
+                               if( strpos( $entry, '#' ) === 0 || $entry === '' )
+                                       continue;
+                               if( preg_match( '/' . str_replace( '/', '\\/', $entry ) . '/i', $url ) ) {
+                                       # Image matches a whitelist entry
+                                       $text = $sk->makeExternalImage( $url );
+                                       break;
+                               }
+                       }
+               }
                return $text;
        }
 
@@ -1851,10 +1861,8 @@ class Parser
         * Insert a NOPARSE hacky thing into any inline links in a chunk that's
         * going to go through further parsing steps before inline URL expansion.
         *
-        * In particular this is important when using action=render, which causes
-        * full URLs to be included.
-        *
-        * Oh man I hate our multi-layer parser!
+        * Not needed quite as much as it used to be since free links are a bit
+        * more sensible these days. But bracketed links are still an issue.
         *
         * @param string more-or-less HTML
         * @return string less-or-more HTML with NOPARSE bits
@@ -3153,7 +3161,7 @@ class Parser
 
                if (strlen($url) > 255)
                        return wfMsg('scarytranscludetoolong');
-               return $this->fetchScaryTemplateMaybeFromCache($url);
+               return "<div class=\"mw-iw-transclusion\">\n" . $this->fetchScaryTemplateMaybeFromCache($url) . "</div>\n";
        }
 
        function fetchScaryTemplateMaybeFromCache($url) {
@@ -3738,11 +3746,13 @@ class Parser
                $nc = '[ _0-9A-Za-z\x80-\xff-]'; # Namespaces can use non-ascii!
 
                $p1 = "/\[\[(:?$nc+:|:|)($tc+?)( \\($tc+\\))\\|]]/";            # [[ns:page (context)|]]
+               $p4 = "/\[\[(:?$nc+:|:|)($tc+?)(（$tc+）)\\|]]/";             # [[ns:page（context）|]]
                $p3 = "/\[\[(:?$nc+:|:|)($tc+?)( \\($tc+\\)|)(, $tc+|)\\|]]/";  # [[ns:page (context), context|]]
                $p2 = "/\[\[\\|($tc+)]]/";                                      # [[|page]]
 
                # try $p1 first, to turn "[[A, B (C)|]]" into "[[A, B (C)|A, B]]"
                $text = preg_replace( $p1, '[[\\1\\2\\3|\\2]]', $text );
+               $text = preg_replace( $p4, '[[\\1\\2\\3|\\2]]', $text );
                $text = preg_replace( $p3, '[[\\1\\2\\3\\4|\\2]]', $text );
 
                $t = $this->mTitle->getText();
@@ -4116,7 +4126,7 @@ class Parser
                        
                        if ( strpos( $matches[0], '%' ) !== false )
                                $matches[1] = urldecode( $matches[1] );
-                       $tp = Title::newFromText( $matches[1], NS_IMAGE );
+                       $tp = Title::newFromText( $matches[1]/*, NS_IMAGE*/ );
                        $nt =& $tp;
                        if( is_null( $nt ) ) {
                                # Bogus title. Ignore these so we don't bomb out later.