4 * File for Parser and related classes
11 * Variable substitution O(N^2) attack
13 * Without countermeasures, it would be possible to attack the parser by saving
14 * a page filled with a large number of inclusions of large pages. The size of
15 * the generated page would be proportional to the square of the input size.
16 * Hence, we limit the number of inclusions of any given page, thus bringing any
17 * attack back to O(N).
20 define( 'MAX_INCLUDE_REPEAT', 100 );
21 define( 'MAX_INCLUDE_SIZE', 1000000 ); // 1 Million
23 # Allowed values for $mOutputType
24 define( 'OT_HTML', 1 );
25 define( 'OT_WIKI', 2 );
26 define( 'OT_MSG' , 3 );
28 # string parameter for extractTags which will cause it
29 # to strip HTML comments in addition to regular
30 # <XML>-style tags. This should not be anything we
31 # may want to use in wikisyntax
32 define( 'STRIP_COMMENTS', 'HTMLCommentStrip' );
34 # prefix for escaping, used in two functions at least
35 define( 'UNIQ_PREFIX', 'NaodW29');
37 # Constants needed for external link processing
38 define( 'URL_PROTOCOLS', 'http|https|ftp|irc|gopher|news|mailto' );
39 define( 'HTTP_PROTOCOLS', 'http|https' );
40 # Everything except bracket, space, or control characters
41 define( 'EXT_LINK_URL_CLASS', '[^]<>\\x00-\\x20\\x7F]' );
43 define( 'EXT_LINK_TEXT_CLASS', '[^\]\\x00-\\x1F\\x7F]' );
44 define( 'EXT_IMAGE_FNAME_CLASS', '[A-Za-z0-9_.,~%\\-+&;#*?!=()@\\x80-\\xFF]' );
45 define( 'EXT_IMAGE_EXTENSIONS', 'gif|png|jpg|jpeg' );
46 define( 'EXT_LINK_BRACKETED', '/\[(('.URL_PROTOCOLS
.'):'.EXT_LINK_URL_CLASS
.'+) *('.EXT_LINK_TEXT_CLASS
.'*?)\]/S' );
47 define( 'EXT_IMAGE_REGEX',
48 '/^('.HTTP_PROTOCOLS
.':)'. # Protocol
49 '('.EXT_LINK_URL_CLASS
.'+)\\/'. # Hostname and path
50 '('.EXT_IMAGE_FNAME_CLASS
.'+)\\.((?i)'.EXT_IMAGE_EXTENSIONS
.')$/S' # Filename
56 * Processes wiki markup
59 * There are three main entry points into the Parser class:
61 * produces HTML output
63 * produces altered wiki markup.
65 * performs brace substitution on MediaWiki messages
68 * objects: $wgLang, $wgDateFormatter, $wgLinkCache, $wgCurParser
70 * NOT $wgArticle, $wgUser or $wgTitle. Keep them away!
73 * $wgUseTex*, $wgUseDynamicDates*, $wgInterwikiMagic*,
74 * $wgNamespacesWithSubpages, $wgAllowExternalImages*,
77 * * only within ParserOptions
90 # Cleared with clearState():
91 var $mOutput, $mAutonumber, $mDTopen, $mStripState = array();
92 var $mVariables, $mIncludeCount, $mArgStack, $mLastSection, $mInPre;
95 var $mOptions, $mTitle, $mOutputType,
96 $mTemplates, // cache of already loaded templates, avoids
97 // multiple SQL queries for the same string
98 $mTemplatePath; // stores an unsorted hash of all the templates already loaded
99 // in this path. Used for loop detection.
109 $this->mTemplates
= array();
110 $this->mTemplatePath
= array();
111 $this->mTagHooks
= array();
120 function clearState() {
121 $this->mOutput
= new ParserOutput
;
122 $this->mAutonumber
= 0;
123 $this->mLastSection
= "";
124 $this->mDTopen
= false;
125 $this->mVariables
= false;
126 $this->mIncludeCount
= array();
127 $this->mStripState
= array();
128 $this->mArgStack
= array();
129 $this->mInPre
= false;
133 * First pass--just handle <nowiki> sections, pass the rest off
134 * to internalParse() which does all the real work.
137 * @return ParserOutput a ParserOutput
139 function parse( $text, &$title, $options, $linestart = true, $clearState = true ) {
140 global $wgUseTidy, $wgContLang;
141 $fname = 'Parser::parse';
142 wfProfileIn( $fname );
148 $this->mOptions
= $options;
149 $this->mTitle
=& $title;
150 $this->mOutputType
= OT_HTML
;
153 $text = $this->strip( $text, $this->mStripState
);
155 $text = $this->internalParse( $text, $linestart );
156 $text = $this->unstrip( $text, $this->mStripState
);
157 # Clean up special characters, only run once, next-to-last before doBlockLevels
160 # french spaces, last one Guillemet-left
161 # only if there is something before the space
162 '/(.) (?=\\?|:|;|!|\\302\\273)/i' => '\\1 \\2',
163 # french spaces, Guillemet-right
164 "/(\\302\\253) /i"=>"\\1 ",
165 '/<hr *>/i' => '<hr />',
166 '/<br *>/i' => '<br />',
167 '/<center *>/i' => '<div class="center">',
168 '/<\\/center *>/i' => '</div>',
169 # Clean up spare ampersands; note that we probably ought to be
170 # more careful about named entities.
# Escape every "&" that does not already begin an entity reference
# (named, decimal or hexadecimal). The replacement has to be the escaped
# form, otherwise this rule leaves the text unchanged.
171 '/&(?!amp;|#[Xx][0-9A-Fa-f]+;|#[0-9]+;|[a-zA-Z0-9]+;)/' => '&amp;'
173 $text = preg_replace( array_keys($fixtags), array_values($fixtags), $text );
176 # french spaces, last one Guillemet-left
177 '/ (\\?|:|;|!|\\302\\273)/i' => ' \\1',
178 # french spaces, Guillemet-right
179 '/(\\302\\253) /i' => '\\1 ',
180 '/<center *>/i' => '<div class="center">',
181 '/<\\/center *>/i' => '</div>'
183 $text = preg_replace( array_keys($fixtags), array_values($fixtags), $text );
186 $text = $this->doBlockLevels( $text, $linestart );
188 $text = $wgContLang->convert($text);
190 $text = $this->unstripNoWiki( $text, $this->mStripState
);
191 $this->mOutput
->setText( $text );
192 wfProfileOut( $fname );
193 return $this->mOutput
;
197 * Get a random string
202 function getRandomString() {
# Two independent mt_rand() draws in the range 0..0x7fffffff, each rendered
# as unpadded hex by dechex() and concatenated, so the length of the result
# varies. mt_rand() is not a cryptographic source.
203 return dechex(mt_rand(0, 0x7fffffff)) . dechex(mt_rand(0, 0x7fffffff));
207 * Replaces all occurrences of <$tag>content</$tag> in the text
208 * with a random marker and returns the new text. The output parameter
209 * $content will be an associative array filled with data in the form
210 * $unique_marker => content.
212 * If $content is already set, the additional entries will be appended
213 * If $tag is set to STRIP_COMMENTS, the function will extract
214 * <!-- HTML comments -->
219 function extractTags($tag, $text, &$content, $uniq_prefix = ''){
220 $rnd = $uniq_prefix . '-' . $tag . Parser
::getRandomString();
227 while ( '' != $text ) {
228 if($tag==STRIP_COMMENTS
) {
229 $p = preg_split( '/<!--/i', $text, 2 );
231 $p = preg_split( "/<\\s*$tag\\s*>/i", $text, 2 );
234 if ( ( count( $p ) < 2 ) ||
( '' == $p[1] ) ) {
237 if($tag==STRIP_COMMENTS
) {
238 $q = preg_split( '/-->/i', $p[1], 2 );
240 $q = preg_split( "/<\\/\\s*$tag\\s*>/i", $p[1], 2 );
242 $marker = $rnd . sprintf('%08X', $n++
);
243 $content[$marker] = $q[0];
244 $stripped .= $marker;
252 * Strips and renders nowiki, pre, math, hiero
253 * If $render is set, performs necessary rendering operations on plugins
254 * Returns the text, and fills an array with data needed in unstrip()
255 * If the $state is already a valid strip state, it adds to the state
257 * @param bool $stripcomments when set, HTML comments <!-- like this -->
258 * will be stripped in addition to other tags. This is important
259 * for section editing, where these comments cause confusion when
260 * counting the sections in the wikisource
264 function strip( $text, &$state, $stripcomments = false ) {
265 $render = ($this->mOutputType
== OT_HTML
);
266 $html_content = array();
267 $nowiki_content = array();
268 $math_content = array();
269 $pre_content = array();
270 $comment_content = array();
271 $ext_content = array();
273 # Replace any instances of the placeholders
274 $uniq_prefix = UNIQ_PREFIX
;
275 #$text = str_replace( $uniq_prefix, wfHtmlEscapeFirst( $uniq_prefix ), $text );
278 global $wgRawHtml, $wgWhitelistEdit;
279 if( $wgRawHtml && $wgWhitelistEdit ) {
280 $text = Parser
::extractTags('html', $text, $html_content, $uniq_prefix);
281 foreach( $html_content as $marker => $content ) {
283 # Raw and unchecked for validity.
284 $html_content[$marker] = $content;
286 $html_content[$marker] = '<html>'.$content.'</html>';
292 $text = Parser
::extractTags('nowiki', $text, $nowiki_content, $uniq_prefix);
293 foreach( $nowiki_content as $marker => $content ) {
295 $nowiki_content[$marker] = wfEscapeHTMLTagsOnly( $content );
297 $nowiki_content[$marker] = '<nowiki>'.$content.'</nowiki>';
302 $text = Parser
::extractTags('math', $text, $math_content, $uniq_prefix);
303 foreach( $math_content as $marker => $content ){
305 if( $this->mOptions
->getUseTeX() ) {
306 $math_content[$marker] = renderMath( $content );
# Wrap the unrendered math source in a properly closed <math> element
308 $math_content[$marker] = '<math>'.$content.'</math>';
311 $math_content[$marker] = '<math>'.$content.'</math>';
316 $text = Parser
::extractTags('pre', $text, $pre_content, $uniq_prefix);
317 foreach( $pre_content as $marker => $content ){
319 $pre_content[$marker] = '<pre>' . wfEscapeHTMLTagsOnly( $content ) . '</pre>';
321 $pre_content[$marker] = '<pre>'.$content.'</pre>';
327 $text = Parser
::extractTags(STRIP_COMMENTS
, $text, $comment_content, $uniq_prefix);
328 foreach( $comment_content as $marker => $content ){
329 $comment_content[$marker] = '<!--'.$content.'-->';
334 foreach ( $this->mTagHooks
as $tag => $callback ) {
# Initialise the per-tag bucket that extractTags() fills by reference, so it
# is an (empty) array even when the text contains no occurrence of the tag
335 $ext_content[$tag] = array();
336 $text = Parser
::extractTags( $tag, $text, $ext_content[$tag], $uniq_prefix );
337 foreach( $ext_content[$tag] as $marker => $content ) {
339 $ext_content[$tag][$marker] = $callback( $content );
341 $ext_content[$tag][$marker] = "<$tag>$content</$tag>";
346 # Merge state with the pre-existing state, if there is one
348 $state['html'] = $state['html'] +
$html_content;
349 $state['nowiki'] = $state['nowiki'] +
$nowiki_content;
350 $state['math'] = $state['math'] +
$math_content;
351 $state['pre'] = $state['pre'] +
$pre_content;
352 $state['comment'] = $state['comment'] +
$comment_content;
354 foreach( $ext_content as $tag => $array ) {
355 if ( array_key_exists( $tag, $state ) ) {
356 $state[$tag] = $state[$tag] +
$array;
361 'html' => $html_content,
362 'nowiki' => $nowiki_content,
363 'math' => $math_content,
364 'pre' => $pre_content,
365 'comment' => $comment_content,
372 * restores pre, math, and hiero removed by strip()
374 * always call unstripNoWiki() after this one
377 function unstrip( $text, &$state ) {
378 # Must expand in reverse order, otherwise nested tags will be corrupted
# Walk the strip state from its last entry back to its first; the loop
# initialiser positions the array pointer, so no separate end() call is needed
380 for ( $contentDict = end( $state ); $contentDict !== false; $contentDict = prev( $state ) ) {
381 if( key($state) != 'nowiki' && key($state) != 'html') {
382 for ( $content = end( $contentDict ); $content !== false; $content = prev( $contentDict ) ) {
383 $text = str_replace( key( $contentDict ), $content, $text );
392 * always call this after unstrip() to preserve the order
396 function unstripNoWiki( $text, &$state ) {
397 # Must expand in reverse order, otherwise nested tags will be corrupted
398 for ( $content = end($state['nowiki']); $content !== false; $content = prev( $state['nowiki'] ) ) {
399 $text = str_replace( key( $state['nowiki'] ), $content, $text );
404 for ( $content = end($state['html']); $content !== false; $content = prev( $state['html'] ) ) {
405 $text = str_replace( key( $state['html'] ), $content, $text );
413 * Add an item to the strip state
414 * Returns the unique tag which must be inserted into the stripped text
415 * The tag will be replaced with the original text in unstrip()
419 function insertStripItem( $text, &$state ) {
420 $rnd = UNIQ_PREFIX
. '-item' . Parser
::getRandomString();
429 $state['item'][$rnd] = $text;
434 * Return allowed HTML attributes
438 function getHTMLattrs () {
439 $htmlattrs = array( # Allowed attributes--no scripting, etc.
440 'title', 'align', 'lang', 'dir', 'width', 'height',
441 'bgcolor', 'clear', /* BR */ 'noshade', /* HR */
442 'cite', /* BLOCKQUOTE, Q */ 'size', 'face', 'color',
443 /* FONT */ 'type', 'start', 'value', 'compact',
444 /* For various lists, mostly deprecated but safe */
445 'summary', 'width', 'border', 'frame', 'rules',
446 'cellspacing', 'cellpadding', 'valign', 'char',
447 'charoff', 'colgroup', 'col', 'span', 'abbr', 'axis',
448 'headers', 'scope', 'rowspan', 'colspan', /* Tables */
449 'id', 'class', 'name', 'style' /* For CSS */
455 * Remove non approved attributes and javascript in css
459 function fixTagAttributes ( $t ) {
460 if ( trim ( $t ) == '' ) return '' ; # Saves runtime ;-)
461 $htmlattrs = $this->getHTMLattrs() ;
463 # Strip non-approved attributes from the tag
465 '/(\\w+)(\\s*=\\s*([^\\s\">]+|\"[^\">]*\"))?/e',
466 "(in_array(strtolower(\"\$1\"),\$htmlattrs)?(\"\$1\".((\"x\$3\" != \"x\")?\"=\$3\":'')):'')",
469 $t = str_replace ( '<></>' , '' , $t ) ; # This should fix bug 980557
471 # Strip javascript "expression" from stylesheets. Brute force approach:
472 # If anything offensive is found, all attributes of the HTML tag are dropped
475 '/style\\s*=.*(expression|tps*:\/\/|url\\s*\().*/is',
476 wfMungeToUtf8( $t ) ) )
485 * interface with html tidy, used if $wgUseTidy = true
490 function tidy ( $text ) {
491 global $wgTidyConf, $wgTidyBin, $wgTidyOpts;
492 global $wgInputEncoding, $wgOutputEncoding;
493 $fname = 'Parser::tidy';
494 wfProfileIn( $fname );
498 switch(strtoupper($wgOutputEncoding)) {
500 $opts .= ($wgInputEncoding == $wgOutputEncoding)?
' -latin1':' -raw';
503 $opts .= ($wgInputEncoding == $wgOutputEncoding)?
' -utf8':' -raw';
509 $wrappedtext = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"'.
510 ' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html>'.
511 '<head><title>test</title></head><body>'.$text.'</body></html>';
512 $descriptorspec = array(
513 0 => array('pipe', 'r'),
514 1 => array('pipe', 'w'),
515 2 => array('file', '/dev/null', 'a')
517 $process = proc_open("$wgTidyBin -config $wgTidyConf $wgTidyOpts$opts", $descriptorspec, $pipes);
518 if (is_resource($process)) {
519 fwrite($pipes[0], $wrappedtext);
521 while (!feof($pipes[1])) {
522 $cleansource .= fgets($pipes[1], 1024);
525 $return_value = proc_close($process);
528 wfProfileOut( $fname );
530 if( $cleansource == '' && $text != '') {
531 wfDebug( "Tidy error detected!\n" );
532 return $text . "\n<!-- Tidy found serious XHTML errors -->\n";
539 * parse the wiki syntax used to render tables
543 function doTableStuff ( $t ) {
544 $fname = 'Parser::doTableStuff';
545 wfProfileIn( $fname );
547 $t = explode ( "\n" , $t ) ;
548 $td = array () ; # Is currently a td tag open?
549 $ltd = array () ; # Was it TD or TH?
550 $tr = array () ; # Is currently a tr tag open?
551 $ltr = array () ; # tr attributes
552 $indent_level = 0; # indent level of the table
553 foreach ( $t AS $k => $x )
556 $fc = substr ( $x , 0 , 1 ) ;
557 if ( preg_match( '/^(:*)\{\|(.*)$/', $x, $matches ) ) {
558 $indent_level = strlen( $matches[1] );
560 str_repeat( '<dl><dd>', $indent_level ) .
561 '<table ' . $this->fixTagAttributes ( $matches[2] ) . '>' ;
562 array_push ( $td , false ) ;
563 array_push ( $ltd , '' ) ;
564 array_push ( $tr , false ) ;
565 array_push ( $ltr , '' ) ;
567 else if ( count ( $td ) == 0 ) { } # Don't do any of the following
568 else if ( '|}' == substr ( $x , 0 , 2 ) ) {
570 $l = array_pop ( $ltd ) ;
571 if ( array_pop ( $tr ) ) $z = '</tr>' . $z ;
572 if ( array_pop ( $td ) ) $z = '</'.$l.'>' . $z ;
574 $t[$k] = $z . str_repeat( '</dd></dl>', $indent_level );
576 else if ( '|-' == substr ( $x , 0 , 2 ) ) { # Allows for |---------------
577 $x = substr ( $x , 1 ) ;
578 while ( $x != '' && substr ( $x , 0 , 1 ) == '-' ) $x = substr ( $x , 1 ) ;
580 $l = array_pop ( $ltd ) ;
581 if ( array_pop ( $tr ) ) $z = '</tr>' . $z ;
582 if ( array_pop ( $td ) ) $z = '</'.$l.'>' . $z ;
585 array_push ( $tr , false ) ;
586 array_push ( $td , false ) ;
587 array_push ( $ltd , '' ) ;
588 array_push ( $ltr , $this->fixTagAttributes ( $x ) ) ;
590 else if ( '|' == $fc ||
'!' == $fc ||
'|+' == substr ( $x , 0 , 2 ) ) { # Caption
592 if ( '|+' == substr ( $x , 0 , 2 ) ) {
594 $x = substr ( $x , 1 ) ;
596 $after = substr ( $x , 1 ) ;
597 if ( $fc == '!' ) $after = str_replace ( '!!' , '||' , $after ) ;
598 $after = explode ( '||' , $after ) ;
601 # Loop through each table cell
602 foreach ( $after AS $theline )
607 $tra = array_pop ( $ltr ) ;
608 if ( !array_pop ( $tr ) ) $z = '<tr '.$tra.">\n" ;
609 array_push ( $tr , true ) ;
610 array_push ( $ltr , '' ) ;
613 $l = array_pop ( $ltd ) ;
614 if ( array_pop ( $td ) ) $z = '</'.$l.'>' . $z ;
615 if ( $fc == '|' ) $l = 'td' ;
616 else if ( $fc == '!' ) $l = 'th' ;
617 else if ( $fc == '+' ) $l = 'caption' ;
619 array_push ( $ltd , $l ) ;
622 $y = explode ( '|' , $theline , 2 ) ;
623 # Note that a '|' inside an invalid link should not
624 # be mistaken as delimiting cell parameters
625 if ( strpos( $y[0], '[[' ) !== false ) {
626 $y = array ($theline);
# One piece: the cell has no attribute part, so emit a bare opening tag.
# Two pieces: the text before the first '|' holds the cell attributes.
628 if ( count ( $y ) == 1 )
629 $y = "{$z}<{$l}>{$y[0]}" ;
630 else $y = "{$z}<{$l} ".$this->fixTagAttributes($y[0]).">{$y[1]}" ;
632 array_push ( $td , true ) ;
637 # Closing open td, tr && table
638 while ( count ( $td ) > 0 )
640 if ( array_pop ( $td ) ) $t[] = '</td>' ;
641 if ( array_pop ( $tr ) ) $t[] = '</tr>' ;
645 $t = implode ( "\n" , $t ) ;
646 # $t = $this->removeHTMLtags( $t );
647 wfProfileOut( $fname );
652 * Helper function for parse() that transforms wiki markup into
653 * HTML. Only called for $mOutputType == OT_HTML.
657 function internalParse( $text, $linestart, $args = array(), $isMain=true ) {
660 $fname = 'Parser::internalParse';
661 wfProfileIn( $fname );
663 $text = $this->removeHTMLtags( $text );
664 $text = $this->replaceVariables( $text, $args );
666 $text = preg_replace( '/(^|\n)-----*/', '\\1<hr />', $text );
668 $text = $this->doHeadings( $text );
669 if($this->mOptions
->getUseDynamicDates()) {
670 global $wgDateFormatter;
671 $text = $wgDateFormatter->reformat( $this->mOptions
->getDateFormat(), $text );
673 $text = $this->doAllQuotes( $text );
674 $text = $this->replaceInternalLinks ( $text );
675 $text = $this->replaceExternalLinks( $text );
676 $text = $this->doMagicLinks( $text );
677 $text = $this->doTableStuff( $text );
678 $text = $this->formatHeadings( $text, $isMain );
679 $sk =& $this->mOptions
->getSkin();
680 $text = $sk->transformContent( $text );
682 wfProfileOut( $fname );
687 * Replace special strings like "ISBN xxx" and "RFC xxx" with
688 * magic external links.
692 function &doMagicLinks( &$text ) {
693 global $wgUseGeoMode;
694 $text = $this->magicISBN( $text );
695 if ( isset( $wgUseGeoMode ) && $wgUseGeoMode ) {
696 $text = $this->magicGEO( $text );
698 $text = $this->magicRFC( $text );
703 * Parse ^^ tokens and return html
707 function doExponent ( $text ) {
708 $fname = 'Parser::doExponent';
709 wfProfileIn( $fname);
710 $text = preg_replace('/\^\^(.*)\^\^/','<small><sup>\\1</sup></small>', $text);
711 wfProfileOut( $fname);
716 * Parse headers and return html
720 function doHeadings( $text ) {
721 $fname = 'Parser::doHeadings';
722 wfProfileIn( $fname );
# Work from level 6 down to level 1 so that a run of '=' is matched by the
# longest marker first and is not consumed as a lower-level heading.
723 for ( $i = 6; $i >= 1; --$i ) {
# $h is the marker for this level: $i consecutive '=' characters
724 $h = substr( '======', 0, $i );
# A heading must start at the beginning of a line (/m) and its closing marker
# must be followed by whitespace or end of line, which is kept as \2
725 $text = preg_replace( "/^{$h}(.+){$h}(\\s|$)/m",
726 "<h{$i}>\\1</h{$i}>\\2", $text );
728 wfProfileOut( $fname );
733 * Replace single quotes with HTML markup
735 * @return string the altered text
737 function doAllQuotes( $text ) {
738 $fname = 'Parser::doAllQuotes';
739 wfProfileIn( $fname );
741 $lines = explode( "\n", $text );
742 foreach ( $lines as $line ) {
743 $outtext .= $this->doQuotes ( $line ) . "\n";
745 $outtext = substr($outtext, 0,-1);
746 wfProfileOut( $fname );
751 * Helper function for doAllQuotes()
754 function doQuotes( $text ) {
755 $arr = preg_split ("/(''+)/", $text, -1, PREG_SPLIT_DELIM_CAPTURE
);
756 if (count ($arr) == 1)
760 # First, do some preliminary work. This may shift some apostrophes from
761 # being mark-up to being text. It also counts the number of occurrences
762 # of bold and italics mark-ups.
770 # If there are ever four apostrophes, assume the first is supposed to
771 # be text, and the remaining three constitute mark-up for bold text.
772 if (strlen ($arr[$i]) == 4)
777 # If there are more than 5 apostrophes in a row, assume they're all
778 # text except for the last 5.
779 else if (strlen ($arr[$i]) > 5)
781 $arr[$i-1] .= str_repeat ("'", strlen ($arr[$i]) - 5);
784 # Count the number of occurrences of bold and italics mark-ups.
785 # We are not counting sequences of five apostrophes.
786 if (strlen ($arr[$i]) == 2) $numitalics++
; else
787 if (strlen ($arr[$i]) == 3) $numbold++
; else
788 if (strlen ($arr[$i]) == 5) { $numitalics++
; $numbold++
; }
793 # If there is an odd number of both bold and italics, it is likely
794 # that one of the bold ones was meant to be an apostrophe followed
795 # by italics. Which one we cannot know for certain, but it is more
796 # likely to be one that has a single-letter word before it.
797 if (($numbold %
2 == 1) && ($numitalics %
2 == 1))
800 $firstsingleletterword = -1;
801 $firstmultiletterword = -1;
805 if (($i %
2 == 1) and (strlen ($r) == 3))
807 $x1 = substr ($arr[$i-1], -1);
808 $x2 = substr ($arr[$i-1], -2, 1);
810 if ($firstspace == -1) $firstspace = $i;
811 } else if ($x2 == ' ') {
812 if ($firstsingleletterword == -1) $firstsingleletterword = $i;
814 if ($firstmultiletterword == -1) $firstmultiletterword = $i;
820 # If there is a single-letter word, use it!
821 if ($firstsingleletterword > -1)
823 $arr [ $firstsingleletterword ] = "''";
824 $arr [ $firstsingleletterword-1 ] .= "'";
826 # If not, but there's a multi-letter word, use that one.
827 else if ($firstmultiletterword > -1)
829 $arr [ $firstmultiletterword ] = "''";
830 $arr [ $firstmultiletterword-1 ] .= "'";
832 # ... otherwise use the first one that has neither.
833 # (notice that it is possible for all three to be -1 if, for example,
834 # there is only one quintuple-apostrophe in the line)
835 else if ($firstspace > -1)
837 $arr [ $firstspace ] = "''";
838 $arr [ $firstspace-1 ] .= "'";
842 # Now let's actually convert our apostrophic mush to HTML!
851 if ($state == 'both')
858 if (strlen ($r) == 2)
861 { $output .= '</i>'; $state = ''; }
862 else if ($state == 'bi')
863 { $output .= '</i>'; $state = 'b'; }
864 else if ($state == 'ib')
865 { $output .= '</b></i><b>'; $state = 'b'; }
866 else if ($state == 'both')
867 { $output .= '<b><i>'.$buffer.'</i>'; $state = 'b'; }
868 else # $state can be 'b' or ''
869 { $output .= '<i>'; $state .= 'i'; }
871 else if (strlen ($r) == 3)
874 { $output .= '</b>'; $state = ''; }
875 else if ($state == 'bi')
876 { $output .= '</i></b><i>'; $state = 'i'; }
877 else if ($state == 'ib')
878 { $output .= '</b>'; $state = 'i'; }
879 else if ($state == 'both')
880 { $output .= '<i><b>'.$buffer.'</b>'; $state = 'i'; }
881 else # $state can be 'i' or ''
882 { $output .= '<b>'; $state .= 'b'; }
884 else if (strlen ($r) == 5)
887 { $output .= '</b><i>'; $state = 'i'; }
888 else if ($state == 'i')
889 { $output .= '</i><b>'; $state = 'b'; }
890 else if ($state == 'bi')
891 { $output .= '</i></b>'; $state = ''; }
892 else if ($state == 'ib')
893 { $output .= '</b></i>'; $state = ''; }
894 else if ($state == 'both')
895 { $output .= '<i><b>'.$buffer.'</b></i>'; $state = ''; }
896 else # ($state == '')
897 { $buffer = ''; $state = 'both'; }
902 # Now close all remaining tags. Notice that the order is important.
903 if ($state == 'b' ||
$state == 'ib')
905 if ($state == 'i' ||
$state == 'bi' ||
$state == 'ib')
909 if ($state == 'both')
910 $output .= '<b><i>'.$buffer.'</i></b>';
916 * Replace external links
918 * Note: we have to do external links before the internal ones,
919 * and otherwise take great care in the order of things here, so
920 * that we don't end up interpreting some URLs twice.
924 function replaceExternalLinks( $text ) {
925 $fname = 'Parser::replaceExternalLinks';
926 wfProfileIn( $fname );
928 $sk =& $this->mOptions
->getSkin();
929 $linktrail = wfMsgForContent('linktrail');
930 $bits = preg_split( EXT_LINK_BRACKETED
, $text, -1, PREG_SPLIT_DELIM_CAPTURE
);
932 $s = $this->replaceFreeExternalLinks( array_shift( $bits ) );
935 while ( $i<count( $bits ) ) {
937 $protocol = $bits[$i++
];
939 $trail = $bits[$i++
];
941 # If the link text is an image URL, replace it with an <img> tag
942 # This happened by accident in the original parser, but some people used it extensively
943 $img = $this->maybeMakeImageLink( $text );
944 if ( $img !== false ) {
950 # No link text, e.g. [http://domain.tld/some.link]
952 # Autonumber if allowed
953 if ( strpos( HTTP_PROTOCOLS
, $protocol ) !== false ) {
954 $text = '[' . ++
$this->mAutonumber
. ']';
956 # Otherwise just use the URL
957 $text = htmlspecialchars( $url );
960 # Have link text, e.g. [http://domain.tld/some.link text]s
962 if ( preg_match( $linktrail, $trail, $m2 ) ) {
968 $encUrl = htmlspecialchars( $url );
969 # Bit in parentheses showing the URL for the printable version
970 if( $url == $text ||
preg_match( "!$protocol://" . preg_quote( $text, '/' ) . "/?$!", $url ) ) {
973 # Expand the URL for printable version
974 if ( ! $sk->suppressUrlExpansion() ) {
975 $paren = "<span class='urlexpansion'> (<i>" . htmlspecialchars ( $encUrl ) . "</i>)</span>";
981 # Process the trail (i.e. everything after this link up until start of the next link),
982 # replacing any non-bracketed links
983 $trail = $this->replaceFreeExternalLinks( $trail );
985 # Use the encoded URL
986 # This means that users can paste URLs directly into the text
987 # Funny characters like ö aren't valid in URLs anyway
988 # This was changed in August 2004
989 $s .= $sk->makeExternalLink( $url, $text, false ) . $dtrail. $paren . $trail;
992 wfProfileOut( $fname );
997 * Replace anything that looks like a URL with a link
1000 function replaceFreeExternalLinks( $text ) {
1001 $bits = preg_split( '/((?:'.URL_PROTOCOLS
.'):)/', $text, -1, PREG_SPLIT_DELIM_CAPTURE
);
1002 $s = array_shift( $bits );
1005 $sk =& $this->mOptions
->getSkin();
1007 while ( $i < count( $bits ) ){
1008 $protocol = $bits[$i++
];
1009 $remainder = $bits[$i++
];
1011 if ( preg_match( '/^('.EXT_LINK_URL_CLASS
.'+)(.*)$/s', $remainder, $m ) ) {
1012 # Found some characters after the protocol that look promising
1013 $url = $protocol . $m[1];
1016 # Move trailing punctuation to $trail
1018 # If there is no left bracket, then consider right brackets fair game too
1019 if ( strpos( $url, '(' ) === false ) {
1023 $numSepChars = strspn( strrev( $url ), $sep );
1024 if ( $numSepChars ) {
1025 $trail = substr( $url, -$numSepChars ) . $trail;
1026 $url = substr( $url, 0, -$numSepChars );
1029 # Replace &amp; from obsolete syntax with a plain &, so the URL is
# handled in its unescaped form from here on
1030 $url = str_replace( '&amp;', '&', $url );
1032 # Is this an external image?
1033 $text = $this->maybeMakeImageLink( $url );
1034 if ( $text === false ) {
1035 # Not an image, make a link
1036 $text = $sk->makeExternalLink( $url, $url );
1038 $s .= $text . $trail;
1040 $s .= $protocol . $remainder;
1047 * make an image if it's allowed
1050 function maybeMakeImageLink( $url ) {
1051 $sk =& $this->mOptions
->getSkin();
1053 if ( $this->mOptions
->getAllowExternalImages() ) {
1054 if ( preg_match( EXT_IMAGE_REGEX
, $url ) ) {
1056 $text = $sk->makeImage( htmlspecialchars( $url ) );
1063 * Process [[ ]] wikilinks
1068 function replaceInternalLinks( $s ) {
1069 global $wgLang, $wgContLang, $wgLinkCache;
1070 static $fname = 'Parser::replaceInternalLinks' ;
1071 # use a counter to prevent too many unknown links from
1072 # being checked for different language variants.
1073 static $convertCount;
1074 wfProfileIn( $fname );
1076 wfProfileIn( $fname.'-setup' );
1078 # the % is needed to support urlencoded titles as well
1079 if ( !$tc ) { $tc = Title
::legalChars() . '#%'; }
1080 $sk =& $this->mOptions
->getSkin();
1082 $redirect = MagicWord
::get ( MAG_REDIRECT
) ;
1084 #split the entire text string on occurrences of [[
1085 $a = explode( '[[', ' ' . $s );
1086 #get the first element (all text up to first [[), and remove the space we added
1087 $s = array_shift( $a );
1088 $s = substr( $s, 1 );
1090 # Match a link having the form [[namespace:link|alternate]]trail
1092 if ( !$e1 ) { $e1 = "/^([{$tc}]+)(?:\\|([^]]+))?]](.*)\$/sD"; }
1093 # Match cases where there is no "]]", which might still be images
1094 static $e1_img = FALSE;
1095 if ( !$e1_img ) { $e1_img = "/^([{$tc}]+)\\|(.*)\$/sD"; }
1096 # Match the end of a line for a word that's not followed by whitespace,
1097 # e.g. in the case of 'The Arab al[[Razi]]', 'al' will be matched
1098 static $e2 = '/^(.*?)([a-zA-Z\x80-\xff]+)$/sD';
1100 $useLinkPrefixExtension = $wgContLang->linkPrefixExtension();
1102 $nottalk = !Namespace::isTalk( $this->mTitle
->getNamespace() );
1104 if ( $useLinkPrefixExtension ) {
1105 if ( preg_match( $e2, $s, $m ) ) {
1106 $first_prefix = $m[2];
1109 $first_prefix = false;
1115 wfProfileOut( $fname.'-setup' );
1117 # Loop for each link
1118 for ($k = 0; isset( $a[$k] ); $k++
) {
1120 wfProfileIn( $fname.'-prefixhandling' );
1121 if ( $useLinkPrefixExtension ) {
1122 if ( preg_match( $e2, $s, $m ) ) {
1130 $prefix = $first_prefix;
1131 $first_prefix = false;
1134 wfProfileOut( $fname.'-prefixhandling' );
1136 $might_be_img = false;
1138 if ( preg_match( $e1, $line, $m ) ) { # page with normal text or alt
1140 # fix up urlencoded title texts
1141 if(preg_match('/%/', $m[1] )) $m[1] = urldecode($m[1]);
1143 } elseif( preg_match($e1_img, $line, $m) ) { # Invalid, but might be an image with a link in its caption
1144 $might_be_img = true;
1146 if(preg_match('/%/', $m[1] )) $m[1] = urldecode($m[1]);
1148 } else { # Invalid form; output directly
1149 $s .= $prefix . '[[' . $line ;
1153 # Don't allow internal links to pages containing
1154 # PROTO: where PROTO is a valid URL protocol; these
1155 # should be external links.
1156 if (preg_match('/((?:'.URL_PROTOCOLS
.'):)/', $m[1])) {
1157 $s .= $prefix . '[[' . $line ;
1161 # Make subpage if necessary
1162 $link = $this->maybeDoSubpageLink( $m[1], $text );
1164 $noforce = (substr($m[1], 0, 1) != ':');
1166 # Strip off leading ':'
1167 $link = substr($link, 1);
1170 $nt = Title
::newFromText( $link );
1172 $s .= $prefix . '[[' . $line;
1176 //check other language variants of the link
1177 //if the article does not exist
1179 $variants = $wgContLang->getVariants();
1181 if(sizeof($variants) > 1 && $convertCount < 200) {
1183 if($nt->getArticleID() == 0) {
1184 foreach ( $variants as $v ) {
1185 if($v == $wgContLang->getPreferredVariant())
1188 $varlink = $wgContLang->autoConvert($link, $v);
1189 $varnt = Title
::newFromText($varlink);
1190 if($varnt && $varnt->getArticleID()>0) {
1195 if($varnt && $varnt->getArticleID()>0) {
1201 $ns = $nt->getNamespace();
1202 $iw = $nt->getInterWiki();
1204 if ($might_be_img) { # if this is actually an invalid link
1205 if ($ns == NS_IMAGE
&& $noforce) { #but might be an image
1207 while (isset ($a[$k+
1]) ) {
1208 #look at the next 'line' to see if we can close it there
# Remove the following segment from $a and take it as the next line.
# array_shift() receives its argument by reference, so the slice returned
# by array_splice() has to be stored in a variable first.
1209 $removed = array_splice( $a, $k + 1, 1 );
$next_line = array_shift( $removed );
1210 if( preg_match("/^(.*?]].*?)]](.*)$/sD", $next_line, $m) ) {
1211 # the first ]] closes the inner link, the second the image
1213 $text .= '[[' . $m[1];
1216 } elseif( preg_match("/^.*?]].*$/sD", $next_line, $m) ) {
1217 #if there's exactly one ]] that's fine, we'll keep looking
1218 $text .= '[[' . $m[0];
1220 #if $next_line is invalid too, we need look no further
1221 $text .= '[[' . $next_line;
1226 # we couldn't find the end of this imageLink, so output it raw
1227 #but don't ignore what might be perfectly normal links in the text we've examined
1228 $text = $this->replaceInternalLinks($text);
1229 $s .= $prefix . '[[' . $link . '|' . $text;
1230 # note: no $trail, because without an end, there *is* no trail
1233 } else { #it's not an image, so output it raw
1234 $s .= $prefix . '[[' . $link . '|' . $text;
1235 # note: no $trail, because without an end, there *is* no trail
1240 $wasblank = ( '' == $text );
1241 if( $wasblank ) $text = $link;
1244 # Link not escaped by : , create the various objects
1248 if( $iw && $this->mOptions
->getInterwikiMagic() && $nottalk && $wgContLang->getLanguageName( $iw ) ) {
1249 array_push( $this->mOutput
->mLanguageLinks
, $nt->getFullText() );
1250 $tmp = $prefix . $trail ;
1251 $s .= (trim($tmp) == '')?
'': $tmp;
1255 if ( $ns == NS_IMAGE
) {
1256 # recursively parse links inside the image caption
1257 # actually, this will parse them in any other parameters, too,
1258 # but it might be hard to fix that, and it doesn't matter ATM
1259 $text = $this->replaceExternalLinks($text);
1260 $text = $this->replaceInternalLinks($text);
1262 # replace the image with a link-holder so that replaceExternalLinks() can't mess with it
1263 $s .= $prefix . $this->insertStripItem( $sk->makeImageLinkObj( $nt, $text ), $this->mStripState
) . $trail;
1264 $wgLinkCache->addImageLinkObj( $nt );
1268 if ( $ns == NS_CATEGORY
) {
1269 $t = $nt->getText() ;
1271 $wgLinkCache->suspend(); # Don't save in links/brokenlinks
1272 $pPLC=$sk->postParseLinkColour();
1273 $sk->postParseLinkColour( false );
1274 $t = $sk->makeLinkObj( $nt, $t, '', '' , $prefix );
1275 $sk->postParseLinkColour( $pPLC );
1276 $wgLinkCache->resume();
1279 if ( $this->mTitle
->getNamespace() == NS_CATEGORY
) {
1280 $sortkey = $this->mTitle
->getText();
1282 $sortkey = $this->mTitle
->getPrefixedText();
1287 $wgLinkCache->addCategoryLinkObj( $nt, $sortkey );
1288 $this->mOutput
->mCategoryLinks
[] = $t ;
1289 $s .= $prefix . $trail ;
1294 $text = $wgContLang->convert($text);
1296 if( ( $nt->getPrefixedText() === $this->mTitle
->getPrefixedText() ) &&
1297 ( strpos( $link, '#' ) === FALSE ) ) {
1298 # Self-links are handled specially; generally de-link and change to bold.
1299 $s .= $prefix . $sk->makeSelfLinkObj( $nt, $text, '', $trail );
1303 # Special and Media are pseudo-namespaces; no pages actually exist in them
1304 if( $ns == NS_MEDIA
) {
1305 $s .= $prefix . $sk->makeMediaLinkObj( $nt, $text ) . $trail;
1306 $wgLinkCache->addImageLinkObj( $nt );
1308 } elseif( $ns == NS_SPECIAL
) {
1309 $s .= $prefix . $sk->makeKnownLinkObj( $nt, $text, '', $trail );
1312 $s .= $sk->makeLinkObj( $nt, $text, '', $trail, $prefix );
1314 wfProfileOut( $fname );
/**
 * Handle link to subpage if necessary.
 *
 * @param string $target the source of the link
 * @param string &$text the link text, modified as necessary: when it is
 *        empty and the current namespace allows subpages, it is set to
 *        $target (already stripped of both slashes for the /Foobar/ form)
 * @return string the full name of the link
 */
function maybeDoSubpageLink($target, &$text) {
	# Link forms:
	# :Foobar -- override special treatment of prefix (images, language links)
	# /Foobar -- convert to CurrentPage/Foobar
	# /Foobar/ -- convert to CurrentPage/Foobar, strip the initial / from text
	global $wgNamespacesWithSubpages;

	$fname = 'Parser::maybeDoSubpageLink';
	wfProfileIn( $fname );

	# Look at the first character. substr() is used instead of $target{0}:
	# the curly-brace offset syntax no longer exists in PHP 8, and it raised
	# a notice when $target was empty.
	if( substr( $target, 0, 1 ) == '/' ) {
		# / at end means we don't want the slash to be shown
		if( substr( $target, -1, 1 ) == '/' ) {
			$target = substr( $target, 1, -1 );
			$noslash = $target;
		} else {
			$noslash = substr( $target, 1 );
		}

		# Some namespaces don't allow subpages
		if( !empty( $wgNamespacesWithSubpages[$this->mTitle->getNamespace()] ) ) {
			# subpages allowed here
			$ret = $this->mTitle->getPrefixedText() . '/' . trim( $noslash );
			if( '' === $text ) {
				$text = $target;
			} # this might be changed for ugliness reasons
		} else {
			# no subpage allowed, use standard link
			$ret = $target;
		}
	} else {
		# no leading slash: not a subpage link, return the target untouched
		$ret = $target;
	}

	wfProfileOut( $fname );
	return $ret;
}
/**
 * Close the currently open paragraph-level section, if there is one.
 * Used by doBlockLevels()
 *
 * Side effects: clears $this->mInPre and resets $this->mLastSection to ''.
 *
 * @return string the closing tag for the open section followed by a
 *         newline, or '' when no section is open
 */
/* private */ function closeParagraph() {
	$result = '';
	if ( '' != $this->mLastSection ) {
		$result = '</' . $this->mLastSection . ">\n";
	}
	$this->mInPre = false;
	$this->mLastSection = '';
	return $result;
}
# getCommon() returns the length of the longest common substring
# of both arguments, starting at the beginning of both (i.e. the length
# of their common prefix). Returns 0 when either string is empty.
/* private */ function getCommon( $st1, $st2 ) {
	$shorter = min( strlen( $st1 ), strlen( $st2 ) );

	# $st1[$i] instead of $st1{$i}: curly-brace string offsets were
	# removed in PHP 8.
	for ( $i = 0; $i < $shorter; ++$i ) {
		if ( $st1[$i] !== $st2[$i] ) { break; }
	}
	return $i;
}
# These next three functions open, continue, and close the list
# element appropriate to the prefix character passed into them.

/**
 * Open a new list for the given prefix character, closing any open
 * paragraph first.
 *
 * @param string $char one of '*', '#', ':' or ';'
 * @return string the paragraph close (if any) followed by the opening list
 *         tags, or just '<!-- ERR 1 -->' for an unknown character.
 *         Opening a ';' list also sets $this->mDTopen.
 */
/* private */ function openList( $char ) {
	$result = $this->closeParagraph();

	if ( '*' == $char ) {
		$result .= '<ul><li>';
	} else if ( '#' == $char ) {
		$result .= '<ol><li>';
	} else if ( ':' == $char ) {
		$result .= '<dl><dd>';
	} else if ( ';' == $char ) {
		$result .= '<dl><dt>';
		$this->mDTopen = true;
	} else {
		# Unknown prefix: the paragraph close is discarded on purpose,
		# only the error marker is returned.
		$result = '<!-- ERR 1 -->';
	}

	return $result;
}
/**
 * Close the current list item and open the next one at the same level.
 *
 * For definition lists the closing tag depends on whether a <dt> is
 * currently open ($this->mDTopen), and the flag is updated to reflect the
 * element being opened.
 *
 * @param string $char one of '*', '#', ':' or ';'
 * @return string the HTML switching to the next item, or '<!-- ERR 2 -->'
 *         for an unknown character
 */
/* private */ function nextItem( $char ) {
	if ( '*' == $char || '#' == $char ) {
		return '</li><li>';
	}

	if ( ':' == $char || ';' == $char ) {
		# Close whichever definition-list element is open
		$close = $this->mDTopen ? '</dt>' : '</dd>';

		if ( ';' == $char ) {
			$this->mDTopen = true;
			return $close . '<dt>';
		}
		$this->mDTopen = false;
		return $close . '<dd>';
	}

	return '<!-- ERR 2 -->';
}
/**
 * Close the list that was opened for the given prefix character.
 *
 * Only '*', '#' and ':' are handled; the visible caller passes characters
 * from a prefix in which ';' has already been replaced by ':'.
 *
 * @param string $char one of '*', '#' or ':'
 * @return string the closing tags followed by a newline, or
 *         '<!-- ERR 3 -->' (no newline) for an unknown character
 */
/* private */ function closeList( $char ) {
	if ( '*' == $char ) {
		$text = '</li></ul>';
	} else if ( '#' == $char ) {
		$text = '</li></ol>';
	} else if ( ':' == $char ) {
		if ( $this->mDTopen ) {
			# A <dt> is still open: close it instead of a <dd>
			$this->mDTopen = false;
			$text = '</dt></dl>';
		} else {
			$text = '</dd></dl>';
		}
	} else {
		return '<!-- ERR 3 -->';
	}
	return $text . "\n";
}
1442 * Make lists from lines starting with ':', '*', '#', etc.
1445 * @return string the lists rendered as HTML
# doBlockLevels(): walks $text line by line and builds $output, turning
# leading runs of '*', '#', ':' and ';' into nested list markup (via
# openList / nextItem / closeList) and wrapping other lines in <p> / <pre>
# sections tracked through $this->mLastSection.
#   $text      - the text to process, split on "\n"
#   $linestart - when false, the first line is appended to the output as-is
#                and excluded from the line loop
# NOTE(review): the embedded line numbers skip values inside this block, so
# closing braces, else-arms and the final return are not visible here.
# Restore them from a clean copy before relying on this code.
1447 function doBlockLevels( $text, $linestart ) {
1448 $fname = 'Parser::doBlockLevels';
1449 wfProfileIn( $fname );
1451 # Parsing through the text line by line. The main thing
1452 # happening here is handling of block-level elements p, pre,
1453 # and making lists from lines starting with * # : etc.
1455 $textLines = explode( "\n", $text );
1457 $lastPrefix = $output = $lastLine = '';
1458 $this->mDTopen
= $inBlockElem = false;
# $paragraphStack is either false or a pending '<p>' / '</p><p>' string that
# is emitted once a following line is processed.
1460 $paragraphStack = false;
1462 if ( !$linestart ) {
1463 $output .= array_shift( $textLines );
1465 foreach ( $textLines as $oLine ) {
1466 $lastPrefixLength = strlen( $lastPrefix );
1467 $preCloseMatch = preg_match('/<\\/pre/i', $oLine );
1468 $preOpenMatch = preg_match('/<pre/i', $oLine );
1469 if ( !$this->mInPre
) {
1470 # Multiple prefixes may abut each other for nested lists.
1471 $prefixLength = strspn( $oLine, '*#:;' );
1472 $pref = substr( $oLine, 0, $prefixLength );
# $pref2 is $pref with ';' mapped to ':'; it is what gets compared with and
# stored in $lastPrefix.
1475 $pref2 = str_replace( ';', ':', $pref );
1476 $t = substr( $oLine, $prefixLength );
1477 $this->mInPre
= !empty($preOpenMatch);
1479 # Don't interpret any other prefixes in preformatted text
1481 $pref = $pref2 = '';
1486 if( $prefixLength && 0 == strcmp( $lastPrefix, $pref2 ) ) {
1487 # Same as the last item, so no need to deal with nesting or opening stuff
1488 $output .= $this->nextItem( substr( $pref, -1 ) );
1489 $paragraphStack = false;
1491 if ( substr( $pref, -1 ) == ';') {
1492 # The one nasty exception: definition lists work like this:
1493 # ; title : definition text
1494 # So we check for : in the remainder text to split up the
1495 # title and definition, without b0rking links.
1496 if ($this->findColonNoLinks($t, $term, $t2) !== false) {
1498 $output .= $term . $this->nextItem( ':' );
1501 } elseif( $prefixLength ||
$lastPrefixLength ) {
1502 # Either open or close a level...
1503 $commonPrefixLength = $this->getCommon( $pref, $lastPrefix );
1504 $paragraphStack = false;
# Close every level of the previous prefix beyond the shared part
1506 while( $commonPrefixLength < $lastPrefixLength ) {
1507 $output .= $this->closeList( $lastPrefix{$lastPrefixLength-1} );
1508 --$lastPrefixLength;
1510 if ( $prefixLength <= $commonPrefixLength && $commonPrefixLength > 0 ) {
1511 $output .= $this->nextItem( $pref{$commonPrefixLength-1} );
# Open one list per prefix character beyond the shared part
1513 while ( $prefixLength > $commonPrefixLength ) {
1514 $char = substr( $pref, $commonPrefixLength, 1 );
1515 $output .= $this->openList( $char );
1517 if ( ';' == $char ) {
1518 # FIXME: This is dupe of code above
1519 if ($this->findColonNoLinks($t, $term, $t2) !== false) {
1521 $output .= $term . $this->nextItem( ':' );
1524 ++
$commonPrefixLength;
1526 $lastPrefix = $pref2;
1528 if( 0 == $prefixLength ) {
1529 # No prefix (not in list)--go to paragraph mode
1530 $uniq_prefix = UNIQ_PREFIX
;
1531 // XXX: use a stack for nestable elements like span, table and div
1532 $openmatch = preg_match('/(<table|<blockquote|<h1|<h2|<h3|<h4|<h5|<h6|<pre|<tr|<p|<ul|<li|<\\/tr|<\\/td|<\\/th)/i', $t );
1533 $closematch = preg_match(
1534 '/(<\\/table|<\\/blockquote|<\\/h1|<\\/h2|<\\/h3|<\\/h4|<\\/h5|<\\/h6|'.
1535 '<td|<th|<div|<\\/div|<hr|<\\/pre|<\\/p|'.$uniq_prefix.'-pre|<\\/li|<\\/ul)/i', $t );
1536 if ( $openmatch or $closematch ) {
1537 $paragraphStack = false;
1538 $output .= $this->closeParagraph();
1539 if($preOpenMatch and !$preCloseMatch) {
1540 $this->mInPre
= true;
1542 if ( $closematch ) {
1543 $inBlockElem = false;
1545 $inBlockElem = true;
1547 } else if ( !$inBlockElem && !$this->mInPre
) {
# A line starting with a space opens or continues a <pre> section
1548 if ( ' ' == $t{0} and ( $this->mLastSection
== 'pre' or trim($t) != '' ) ) {
1550 if ($this->mLastSection
!= 'pre') {
1551 $paragraphStack = false;
1552 $output .= $this->closeParagraph().'<pre>';
1553 $this->mLastSection
= 'pre';
1555 $t = substr( $t, 1 );
# Blank line handling: emit or queue a paragraph break
1558 if ( '' == trim($t) ) {
1559 if ( $paragraphStack ) {
1560 $output .= $paragraphStack.'<br />';
1561 $paragraphStack = false;
1562 $this->mLastSection
= 'p';
1564 if ($this->mLastSection
!= 'p' ) {
1565 $output .= $this->closeParagraph();
1566 $this->mLastSection
= '';
1567 $paragraphStack = '<p>';
1569 $paragraphStack = '</p><p>';
1573 if ( $paragraphStack ) {
1574 $output .= $paragraphStack;
1575 $paragraphStack = false;
1576 $this->mLastSection
= 'p';
1577 } else if ($this->mLastSection
!= 'p') {
1578 $output .= $this->closeParagraph().'<p>';
1579 $this->mLastSection
= 'p';
1585 if ($paragraphStack === false) {
# After the last line: close any lists that are still open, then the last
# open section.
1589 while ( $prefixLength ) {
1590 $output .= $this->closeList( $pref2{$prefixLength-1} );
1593 if ( '' != $this->mLastSection
) {
1594 $output .= '</' . $this->mLastSection
. '>';
1595 $this->mLastSection
= '';
1598 wfProfileOut( $fname );
/**
 * Split up a string on ':', ignoring any occurrences inside
 * <a>..</a> or <span>...</span>
 *
 * A ':' is accepted only when the text before it contains at least as many
 * '</a>' as '<a' substrings and at least as many '</span>' as '<span'
 * substrings; otherwise the search continues with the next ':'.
 *
 * @param string $str the string to split
 * @param string &$before set to everything before the ':'
 * @param string &$after set to everything after the ':'
 * @return mixed the position (int) of the ':', or false if none found.
 *         $before and $after are overwritten for every candidate ':', so
 *         when false is returned they may hold the last rejected split.
 */
function findColonNoLinks($str, &$before, &$after) {
	# I wonder if we should make this count all tags, not just <a>
	# and <span>. That would prevent us from matching a ':' that
	# comes in the middle of italics or other such formatting....

	$fname = 'Parser::findColonNoLinks';
	wfProfileIn( $fname );

	$pos = 0;
	do {
		$colon = strpos($str, ':', $pos);

		if ($colon !== false) {
			$before = substr($str, 0, $colon);
			$after = substr($str, $colon + 1);

			# Skip any ':' within <a> or <span> pairs
			$a = substr_count($before, '<a');
			$s = substr_count($before, '<span');
			$ca = substr_count($before, '</a>');
			$cs = substr_count($before, '</span>');

			if ($a <= $ca and $s <= $cs) {
				# Tags are balanced before ':'; ok
				break;
			}
			# Unbalanced: resume the search just past this ':'
			$pos = $colon + 1;
		}
	} while ($colon !== false);

	wfProfileOut( $fname );
	return $colon;
}
1643 * Return value of a magic variable (like PAGENAME)
# getVariableValue(): returns the value of the magic variable identified by
# $index (one of the MAG_* constants). Date-based values come from date(),
# numbers are passed through $wgContLang->formatNum(), and page-related
# values are read from $this->mTitle.
# NOTE(review): the embedded line numbers skip values here: the switch
# header, the case labels for the three $this->mTitle returns, and whatever
# cases use $wgSitename / $wgServer are not visible in this block.
1647 function getVariableValue( $index ) {
1648 global $wgContLang, $wgSitename, $wgServer;
1651 case MAG_CURRENTMONTH
:
1652 return $wgContLang->formatNum( date( 'm' ) );
1653 case MAG_CURRENTMONTHNAME
:
1654 return $wgContLang->getMonthName( date('n') );
1655 case MAG_CURRENTMONTHNAMEGEN
:
1656 return $wgContLang->getMonthNameGen( date('n') );
1657 case MAG_CURRENTDAY
:
1658 return $wgContLang->formatNum( date('j') );
1660 return $this->mTitle
->getText();
1662 return $this->mTitle
->getPartialURL();
1664 # return Namespace::getCanonicalName($this->mTitle->getNamespace());
1665 return $wgContLang->getNsText($this->mTitle
->getNamespace()); # Patch by Dori
1666 case MAG_CURRENTDAYNAME
:
# date('w') is 0-based (0 = Sunday); the +1 shifts it to a 1-based index
1667 return $wgContLang->getWeekdayName( date('w')+
1 );
1668 case MAG_CURRENTYEAR
:
1669 return $wgContLang->formatNum( date( 'Y' ) );
1670 case MAG_CURRENTTIME
:
1671 return $wgContLang->time( wfTimestampNow(), false );
1672 case MAG_NUMBEROFARTICLES
:
1673 return $wgContLang->formatNum( wfNumberOfArticles() );
/**
 * initialise the magic variables (like CURRENTMONTHNAME)
 *
 * Rebuilds $this->mVariables from scratch: for every id in the global
 * $wgVariableIDs, the matching MagicWord registers the current value
 * returned by getVariableValue() into the array.
 */
function initialiseVariables() {
	global $wgVariableIDs;

	$fname = 'Parser::initialiseVariables';
	wfProfileIn( $fname );

	$this->mVariables = array();
	foreach ( $wgVariableIDs as $id ) {
		$mw =& MagicWord::get( $id );
		$mw->addToArray( $this->mVariables, $this->getVariableValue( $id ) );
	}

	wfProfileOut( $fname );
}
/**
 * Replace magic variables, templates, and template arguments
 * with the appropriate text. Templates are substituted recursively,
 * taking care to avoid infinite loops.
 *
 * Note that the substitution depends on value of $mOutputType:
 *  OT_WIKI: only {{subst:}} templates
 *  OT_MSG: only magic variables
 *  OT_HTML: all templates and magic variables
 *
 * Text longer than MAX_INCLUDE_SIZE bytes is returned untouched.
 *
 * @param string $text The text to transform
 * @param array $args Key-value pairs representing template parameters to substitute
 * @return string the transformed text
 */
function replaceVariables( $text, $args = array() ) {
	# Prevent too big inclusions
	if ( strlen( $text ) > MAX_INCLUDE_SIZE ) {
		return $text;
	}

	$fname = 'Parser::replaceVariables';
	wfProfileIn( $fname );

	$titleChars = Title::legalChars();

	# This function is called recursively. To keep track of arguments we need a stack:
	array_push( $this->mArgStack, $args );

	# The string callbacks below are plain functions, so the current
	# parser is published through a global for them.
	# PHP global rebinding syntax is a bit weird, need to use the GLOBALS array
	$GLOBALS['wgCurParser'] =& $this;

	# Variable substitution
	$text = preg_replace_callback( "/{{([$titleChars]*?)}}/", 'wfVariableSubstitution', $text );

	if ( $this->mOutputType == OT_HTML || $this->mOutputType == OT_WIKI ) {
		# Argument substitution
		$text = preg_replace_callback( "/{{{([$titleChars]*?)}}}/", 'wfArgSubstitution', $text );
	}

	# Template substitution
	$regex = '/(\\n|{)?{{(['.$titleChars.']*)(\\|.*?|)}}/s';
	$text = preg_replace_callback( $regex, 'wfBraceSubstitution', $text );

	array_pop( $this->mArgStack );

	wfProfileOut( $fname );
	return $text;
}
/**
 * Replace magic variables
 *
 * Callback for the {{...}} pattern: $matches[0] is the full match and
 * $matches[1] the text between the braces.
 *
 * @param array $matches regex match groups
 * @return string the variable's value, or the original match when the name
 *         is not a known variable (or substitution is being skipped)
 */
function variableSubstitution( $matches ) {
	if ( !$this->mVariables ) {
		$this->initialiseVariables();
	}

	$skip = false;
	if ( $this->mOutputType == OT_WIKI ) {
		# Do only magic variables prefixed by SUBST
		$mwSubst =& MagicWord::get( MAG_SUBST );
		if ( !$mwSubst->matchStartAndRemove( $matches[1] ) ) {
			$skip = true;
		}
		# Note that if we don't substitute the variable below,
		# we don't remove the {{subst:}} magic word, in case
		# it is a template rather than a magic variable.
	}

	if ( !$skip && array_key_exists( $matches[1], $this->mVariables ) ) {
		$text = $this->mVariables[$matches[1]];
		$this->mOutput->mContainsOldMagic = true;
	} else {
		$text = $matches[0];
	}
	return $text;
}
/**
 * Split template arguments
 *
 * @param string $argsString the raw argument text including its leading
 *        '|', or '' when the template was given no arguments
 * @return array the arguments in order, without the separating '|'
 */
function getTemplateArgs( $argsString ) {
	if ( $argsString === '' ) {
		return array();
	}

	# Drop the leading '|' before splitting
	$args = explode( '|', substr( $argsString, 1 ) );

	# If any of the arguments contains a '[[' but no ']]', it needs to be
	# merged with the next arg because the '|' character between belongs
	# to the link syntax and not the template parameter syntax.
	# The index only advances once the current argument is balanced, so a
	# link containing several '|' is merged repeatedly. The last argument
	# is never examined because there is nothing left to merge it with.
	$i = 0;
	while ( $i < count( $args ) - 1 ) {
		if ( substr_count( $args[$i], '[[' ) != substr_count( $args[$i], ']]' ) ) {
			$args[$i] .= '|' . $args[$i + 1];
			array_splice( $args, $i + 1, 1 );
		} else {
			$i++;
		}
	}

	return $args;
}
1802 * Return the text of a template, after recursively
1803 * replacing any variables or templates within the template.
1805 * @param array $matches The parts of the template
1806 * $matches[1]: the title, i.e. the part before the |
1807 * $matches[2]: the parameters (including a leading |), if any
1808 * @return string the text of the template
# braceSubstitution(): callback that expands one {{...}} construct.
# $matches[0] is the full match, $matches[1] the optional character
# captured in front of the braces (kept as $linestart), $matches[2] the part
# before the first '|' and $matches[3] the raw argument string.
# The visible steps are tried in order: SUBST handling, MSGNW / MSG / INT,
# NS:, LOCALURL / LOCALURLE, GRAMMAR, the in-memory template cache, and
# finally loading the page text through Article.
# NOTE(review): the embedded line numbers skip values throughout this block.
# The initialisation of $found, $nowiki, $noparse, $title, $func and $index,
# many closing braces / else-arms, and the final return are not visible.
1811 function braceSubstitution( $matches ) {
1812 global $wgLinkCache, $wgContLang;
1813 $fname = 'Parser::braceSubstitution';
1820 # Need to know if the template comes at the start of a line,
1821 # to treat the beginning of the template like the beginning
1822 # of a line for tables and block-level elements.
1823 $linestart = $matches[1];
1825 # $part1 is the bit before the first |, and must contain only title characters
1826 # $args is a list of arguments, starting from index 0, not including $part1
1828 $part1 = $matches[2];
1829 # If the third subpattern matched anything, it will start with |
1831 $args = $this->getTemplateArgs($matches[3]);
1832 $argc = count( $args );
1834 # Don't parse {{{}}} because that's only for template arguments
1835 if ( $linestart === '{' ) {
1836 $text = $matches[0];
1843 $mwSubst =& MagicWord
::get( MAG_SUBST
);
1844 if ( $mwSubst->matchStartAndRemove( $part1 ) xor ($this->mOutputType
== OT_WIKI
) ) {
1845 # One of two possibilities is true:
1846 # 1) Found SUBST but not in the PST phase
1847 # 2) Didn't find SUBST and in the PST phase
1848 # In either case, return without further processing
1849 $text = $matches[0];
1855 # MSG, MSGNW and INT
1858 $mwMsgnw =& MagicWord
::get( MAG_MSGNW
);
1859 if ( $mwMsgnw->matchStartAndRemove( $part1 ) ) {
1862 # Remove obsolete MSG:
1863 $mwMsg =& MagicWord
::get( MAG_MSG
);
1864 $mwMsg->matchStartAndRemove( $part1 );
1867 # Check if it is an internal message
1868 $mwInt =& MagicWord
::get( MAG_INT
);
1869 if ( $mwInt->matchStartAndRemove( $part1 ) ) {
1870 if ( $this->incrementIncludeCount( 'int:'.$part1 ) ) {
1871 $text = $linestart . wfMsgReal( $part1, $args, true );
1879 # Check for NS: (namespace expansion)
1880 $mwNs = MagicWord
::get( MAG_NS
);
1881 if ( $mwNs->matchStartAndRemove( $part1 ) ) {
# A non-zero numeric argument is used directly as the namespace index
1882 if ( intval( $part1 ) ) {
1883 $text = $linestart . $wgContLang->getNsText( intval( $part1 ) );
# NOTE(review): 'Namespace' is a reserved word from PHP 5.3 on, so this
# static call cannot parse on newer PHP - confirm the target PHP version.
1886 $index = Namespace::getCanonicalIndex( strtolower( $part1 ) );
1887 if ( !is_null( $index ) ) {
1888 $text = $linestart . $wgContLang->getNsText( $index );
1895 # LOCALURL and LOCALURLE
1897 $mwLocal = MagicWord
::get( MAG_LOCALURL
);
1898 $mwLocalE = MagicWord
::get( MAG_LOCALURLE
);
# $func holds the name of the Title method that is called dynamically below
1900 if ( $mwLocal->matchStartAndRemove( $part1 ) ) {
1901 $func = 'getLocalURL';
1902 } elseif ( $mwLocalE->matchStartAndRemove( $part1 ) ) {
1903 $func = 'escapeLocalURL';
1908 if ( $func !== '' ) {
1909 $title = Title
::newFromText( $part1 );
1910 if ( !is_null( $title ) ) {
1912 $text = $linestart . $title->$func( $args[0] );
1914 $text = $linestart . $title->$func();
1922 if ( !$found && $argc == 1 ) {
1923 $mwGrammar =& MagicWord
::get( MAG_GRAMMAR
);
1924 if ( $mwGrammar->matchStartAndRemove( $part1 ) ) {
1925 $text = $linestart . $wgContLang->convertGrammar( $args[0], $part1 );
1930 # Template table test
1932 # Did we encounter this template already? If yes, it is in the cache
1933 # and we need to check for loops.
1934 if ( !$found && isset( $this->mTemplates
[$part1] ) ) {
1935 # set $text to cached message.
1936 $text = $linestart . $this->mTemplates
[$part1];
1939 # Infinite loop test
1940 if ( isset( $this->mTemplatePath
[$part1] ) ) {
1943 $text .= '<!-- WARNING: template loop detected -->';
1947 # Load from database
1948 $itcamefromthedatabase = false;
1951 $part1 = $this->maybeDoSubpageLink( $part1, $subpage='' );
1952 if ($subpage !== '') {
1953 $ns = $this->mTitle
->getNamespace();
1955 $title = Title
::newFromText( $part1, $ns );
1956 if ( !is_null( $title ) && !$title->isExternal() ) {
1957 # Check for excessive inclusion
1958 $dbk = $title->getPrefixedDBkey();
1959 if ( $this->incrementIncludeCount( $dbk ) ) {
1960 # This should never be reached.
1961 $article = new Article( $title );
1962 $articleContent = $article->getContentWithoutUsingSoManyDamnGlobals();
1963 if ( $articleContent !== false ) {
1965 $text = $linestart . $articleContent;
1966 $itcamefromthedatabase = true;
1970 # If the title is valid but undisplayable, make a link to it
1971 if ( $this->mOutputType
== OT_HTML
&& !$found ) {
1972 $text = $linestart . '[['.$title->getPrefixedText().']]';
1976 # Template cache array insertion
1977 $this->mTemplates
[$part1] = $text;
1981 # Recursive parsing, escaping and link table handling
1982 # Only for HTML output
1983 if ( $nowiki && $found && $this->mOutputType
== OT_HTML
) {
1984 $text = wfEscapeWikiText( $text );
1985 } elseif ( ($this->mOutputType
== OT_HTML ||
$this->mOutputType
== OT_WIKI
) && $found && !$noparse) {
1986 # Clean up argument array
# Positional arguments get numeric keys from $index; "name=value" arguments
# are stored under their trimmed name.
1987 $assocArgs = array();
1989 foreach( $args as $arg ) {
1990 $eqpos = strpos( $arg, '=' );
1991 if ( $eqpos === false ) {
1992 $assocArgs[$index++
] = $arg;
1994 $name = trim( substr( $arg, 0, $eqpos ) );
1995 $value = trim( substr( $arg, $eqpos+
1 ) );
1996 if ( $value === false ) {
1999 if ( $name !== false ) {
2000 $assocArgs[$name] = $value;
2005 # Add a new element to the template recursion path
2006 $this->mTemplatePath
[$part1] = 1;
2008 $text = $this->strip( $text, $this->mStripState
);
2009 $text = $this->removeHTMLtags( $text );
2010 $text = $this->replaceVariables( $text, $assocArgs );
2012 # Resume the link cache and register the inclusion as a link
2013 if ( $this->mOutputType
== OT_HTML
&& !is_null( $title ) ) {
2014 $wgLinkCache->addLinkObj( $title );
2017 # If the template begins with a table or block-level
2018 # element, it should be treated as beginning a new line.
# NOTE(review): '\n' in single quotes is a backslash followed by 'n', not a
# newline, so this comparison is true even when $linestart is "\n".
# Probably meant "\n" - confirm before changing.
2019 if ($linestart !== '\n' && preg_match('/^({\\||:|;|#|\*)/', $text)) {
2020 $text = "\n" . $text;
2024 # Empties the template path
2025 $this->mTemplatePath
= array();
2029 # replace ==section headers==
2030 # XXX this needs to go away once we have a better parser.
2031 if ( $this->mOutputType
!= OT_WIKI
&& $itcamefromthedatabase ) {
2032 if( !is_null( $title ) )
2033 $encodedname = base64_encode($title->getPrefixedDBkey());
2035 $encodedname = base64_encode("");
# Split the text on heading lines; the capture group keeps each heading as
# its own element between the surrounding text chunks.
2036 $m = preg_split('/(^={1,6}.*?={1,6}\s*?$)/m', $text, -1,
2037 PREG_SPLIT_DELIM_CAPTURE
);
2040 for( $i = 0; $i < count($m); $i +
= 2 ) {
2042 if (!isset($m[$i +
1]) ||
$m[$i +
1] == "") continue;
2044 if( strstr($hl, "<!--MWTEMPLATESECTION") ) {
# Tag the heading with a marker comment carrying the base64-encoded template
# name and section number.
2048 preg_match('/^(={1,6})(.*?)(={1,6})\s*?$/m', $hl, $m2);
2049 $text .= $m2[1] . $m2[2] . "<!--MWTEMPLATESECTION="
2050 . $encodedname . "&" . base64_encode("$nsec") . "-->" . $m2[3];
2057 # Empties the template path
2058 $this->mTemplatePath
= array();
/**
 * Triple brace replacement -- used for template arguments
 *
 * Looks the trimmed argument name up in the argument set on top of
 * $this->mArgStack.
 *
 * @param array $matches regex match groups: [0] is the whole {{{name}}}
 *        match, [1] the name between the braces
 * @return string the argument's value, or the original match when the
 *         current argument set has no such key
 */
function argSubstitution( $matches ) {
	$arg = trim( $matches[1] );
	$inputArgs = end( $this->mArgStack );

	if ( array_key_exists( $arg, $inputArgs ) ) {
		return $inputArgs[$arg];
	}

	# Unknown argument: leave the markup as it was
	return $matches[0];
}
/**
 * Returns true if the function is allowed to include this entity
 *
 * Every call counts as one inclusion of $dbk; the entity may be included
 * as long as its running count does not exceed MAX_INCLUDE_REPEAT.
 *
 * @param string $dbk key identifying the included entity
 * @return bool true while the count is within the limit, false afterwards
 */
function incrementIncludeCount( $dbk ) {
	if ( !array_key_exists( $dbk, $this->mIncludeCount ) ) {
		$this->mIncludeCount[$dbk] = 0;
	}
	return ++$this->mIncludeCount[$dbk] <= MAX_INCLUDE_REPEAT;
}
2099 * Cleans up HTML, removes dangerous tags and attributes, and
2100 * removes HTML comments
# removeHTMLtags(): strips HTML comments from $text, splits it on '<' and
# rebuilds it piece by piece. A piece whose tag name is in the allowed
# element list is re-emitted with its attributes passed through
# fixTagAttributes(); $tagstack / $tablestack track open tags so that
# remaining ones can be closed at the end. Two loops over $bits are
# visible; the condition choosing between them is not.
# NOTE(review): the embedded line numbers skip values throughout this
# block. The array terminators, the conditions around the allowed-tag
# lists, the $badtag handling and the final return are not visible.
2103 function removeHTMLtags( $text ) {
2104 global $wgUseTidy, $wgUserHtml;
2105 $fname = 'Parser::removeHTMLtags';
2106 wfProfileIn( $fname );
2109 $htmlpairs = array( # Tags that must be closed
2110 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
2111 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
2112 'strike', 'strong', 'tt', 'var', 'div', 'center',
2113 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
2114 'ruby', 'rt' , 'rb' , 'rp', 'p'
2116 $htmlsingle = array(
2117 'br', 'hr', 'li', 'dt', 'dd'
2119 $htmlnest = array( # Tags that can be nested--??
2120 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
2121 'dl', 'font', 'big', 'small', 'sub', 'sup'
2123 $tabletags = array( # Can only appear inside table
# Alternative branch: empty lists, i.e. no HTML element is allowed
2127 $htmlpairs = array();
2128 $htmlsingle = array();
2129 $htmlnest = array();
2130 $tabletags = array();
2133 $htmlsingle = array_merge( $tabletags, $htmlsingle );
2134 $htmlelements = array_merge( $htmlsingle, $htmlpairs );
2136 $htmlattrs = $this->getHTMLattrs () ;
2138 # Remove HTML comments
2139 $text = $this->removeHTMLcomments( $text );
# Everything before the first '<' is plain text; each further element of
# $bits starts right after a '<'.
2141 $bits = explode( '<', $text );
2142 $text = array_shift( $bits );
2144 $tagstack = array(); $tablestack = array();
2145 foreach ( $bits as $x ) {
# Notices and warnings are silenced while the piece is matched and unpacked
2146 $prev = error_reporting( E_ALL
& ~
( E_NOTICE | E_WARNING
) );
2147 preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
2149 list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
2150 error_reporting( $prev );
2153 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
2157 if ( ! in_array( $t, $htmlsingle ) &&
2158 ( $ot = @array_pop
( $tagstack ) ) != $t ) {
2159 @array_push
( $tagstack, $ot );
# Closing a table restores the tag stack that was saved when it was opened
2162 if ( $t == 'table' ) {
2163 $tagstack = array_pop( $tablestack );
2168 # Keep track for later
2169 if ( in_array( $t, $tabletags ) &&
2170 ! in_array( 'table', $tagstack ) ) {
2172 } else if ( in_array( $t, $tagstack ) &&
2173 ! in_array ( $t , $htmlnest ) ) {
2175 } else if ( ! in_array( $t, $htmlsingle ) ) {
# Opening a table saves the current tag stack and starts a fresh one
2176 if ( $t == 'table' ) {
2177 array_push( $tablestack, $tagstack );
2178 $tagstack = array();
2180 array_push( $tagstack, $t );
2182 # Strip non-approved attributes from the tag
2183 $newparams = $this->fixTagAttributes($params);
# NOTE(review): search and replacement are the same string here, so this
# call is a no-op as written; the replacement was presumably an HTML entity
# that got decoded somewhere - confirm against a clean copy.
2187 $rest = str_replace( '>', '>', $rest );
2188 $text .= "<$slash$t $newparams$brace$rest";
2192 $text .= '<' . str_replace( '>', '>', $x);
2194 # Close off any remaining tags
2195 while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
2197 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
2200 # this might be possible using tidy itself
# Second variant of the loop: no tag-stack bookkeeping, only the attribute
# clean-up for allowed elements.
2201 foreach ( $bits as $x ) {
2202 preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
2204 @list
( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
2205 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
2206 $newparams = $this->fixTagAttributes($params);
2207 $rest = str_replace( '>', '>', $rest );
2208 $text .= "<$slash$t $newparams$brace$rest";
2210 $text .= '<' . str_replace( '>', '>', $x);
2214 wfProfileOut( $fname );
/**
 * Remove '<!--', '-->', and everything between.
 * To avoid leaving blank lines, when a comment is both preceded
 * and followed by a newline (ignoring spaces), trim leading and
 * trailing spaces and one of the newlines.
 *
 * An unterminated comment (a '<!--' with no later '-->') stops processing;
 * it and everything after it are left in place.
 *
 * @param string $text the text to clean
 * @return string the text without HTML comments
 */
function removeHTMLcomments( $text ) {
	$fname='Parser::removeHTMLcomments';
	wfProfileIn( $fname );
	while (($start = strpos($text, '<!--')) !== false) {
		$end = strpos($text, '-->', $start + 4);
		if ($end === false) {
			# Unterminated comment; bail out
			break;
		}

		# Move $end just past the closing '-->'
		$end += 3;

		# Trim space and newline if the comment is both
		# preceded and followed by a newline
		$spaceStart = max($start - 1, 0);
		$spaceLen = $end - $spaceStart;
		# Widen the span to the left over any run of spaces...
		while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
			$spaceStart--;
			$spaceLen++;
		}
		# ...and to the right as well
		while (substr($text, $spaceStart + $spaceLen, 1) === ' ') {
			$spaceLen++;
		}
		if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
			# Remove the comment, leading and trailing
			# spaces, and leave only one newline.
			$text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
		} else {
			# Remove just the comment.
			$text = substr_replace($text, '', $start, $end - $start);
		}
	}
	wfProfileOut( $fname );
	return $text;
}
2263 * This function accomplishes several tasks:
2264 * 1) Auto-number headings if that option is enabled
2265 * 2) Add an [edit] link to sections for logged in users who have enabled the option
2266 * 3) Add a Table of contents on the top for users who have enabled the option
2267 * 4) Auto-anchor headings
2269 * It loops through all headlines, collects the necessary data, then splits up the
2270 * string and re-inserts the newly formatted headlines.
# formatHeadings(): finds all <H1>..<H6> headlines in $text, builds a
# numbered / anchored replacement for each one (optionally with edit-section
# links), assembles the table of contents through the skin's tocIndent /
# tocUnindent / tocLine / tocTable methods, then splits $text on the
# headlines and re-inserts the rebuilt ones.
#   $text   - HTML text to process
#   $isMain - checked together with $doShowToc and !$forceTocHere on the
#             first text block (see the TOC condition near the end)
# NOTE(review): the embedded line numbers skip values throughout this
# block. Several assignments (e.g. to $showEditLink, $doShowToc, $toc,
# $level, $toclevel, $headlineCount, $istemplate, $full, $i) and many
# closing braces / else-arms are not visible.
2273 /* private */ function formatHeadings( $text, $isMain=true ) {
2274 global $wgInputEncoding, $wgMaxTocLevel, $wgContLang, $wgLinkHolders;
2276 $doNumberHeadings = $this->mOptions
->getNumberHeadings();
2277 $doShowToc = $this->mOptions
->getShowToc();
2278 $forceTocHere = false;
2279 if( !$this->mTitle
->userCanEdit() ) {
2281 $rightClickHack = 0;
2283 $showEditLink = $this->mOptions
->getEditSection();
2284 $rightClickHack = $this->mOptions
->getEditSectionOnRightClick();
2287 # Inhibit editsection links if requested in the page
2288 $esw =& MagicWord
::get( MAG_NOEDITSECTION
);
2289 if( $esw->matchAndRemove( $text ) ) {
2292 # if the string __NOTOC__ (not case-sensitive) occurs in the HTML,
2294 $mw =& MagicWord
::get( MAG_NOTOC
);
2295 if( $mw->matchAndRemove( $text ) ) {
2299 # never add the TOC to the Main Page. This is an entry page that should not
2300 # be more than 1-2 screens large anyway
2301 if( $this->mTitle
->getPrefixedText() == wfMsg('mainpage') ) {
2305 # Get all headlines for numbering them and adding funky stuff like [edit]
2306 # links - this is for later, but we need the number of headlines right now
# $matches[1] = heading levels, $matches[2] = rest of each opening tag
# (attributes and '>'), $matches[3] = headline contents
2307 $numMatches = preg_match_all( '/<H([1-6])(.*?' . '>)(.*?)<\/H[1-6]>/i', $text, $matches );
2309 # if there are fewer than 4 headlines in the article, do not show TOC
2310 if( $numMatches < 4 ) {
2314 # if the string __TOC__ (not case-sensitive) occurs in the HTML,
2315 # override above conditions and always show TOC at that place
2316 $mw =& MagicWord
::get( MAG_TOC
);
2317 if ($mw->match( $text ) ) {
2319 $forceTocHere = true;
2321 # if the string __FORCETOC__ (not case-sensitive) occurs in the HTML,
2322 # override above conditions and always show TOC above first header
2323 $mw =& MagicWord
::get( MAG_FORCETOC
);
2324 if ($mw->matchAndRemove( $text ) ) {
2331 # We need this to perform operations on the HTML
2332 $sk =& $this->mOptions
->getSkin();
2336 $sectionCount = 0; # headlineCount excluding template sections
2338 # Ugh .. the TOC should have neat indentation levels which can be
2339 # passed to the skin functions. These are determined here
2344 $sublevelCount = array();
2347 foreach( $matches[3] as $headline ) {
2349 $templatetitle = "";
2350 $templatesection = 0;
# Headlines that carry a template-section marker comment: decode the
# template title and section number, then strip the marker from the headline
2352 if (preg_match("/<!--MWTEMPLATESECTION=([^&]+)&([^_]+)-->/", $headline, $mat)) {
2354 $templatetitle = base64_decode($mat[1]);
2355 $templatesection = 1 +
(int)base64_decode($mat[2]);
2356 $headline = preg_replace("/<!--MWTEMPLATESECTION=([^&]+)&([^_]+)-->/", "", $headline);
2361 $prevlevel = $level;
2363 $level = $matches[1][$headlineCount];
2364 if( ( $doNumberHeadings ||
$doShowToc ) && $prevlevel && $level > $prevlevel ) {
2365 # reset when we enter a new level
2366 $sublevelCount[$level] = 0;
2367 $toc .= $sk->tocIndent( $level - $prevlevel );
2368 $toclevel +
= $level - $prevlevel;
2370 if( ( $doNumberHeadings ||
$doShowToc ) && $level < $prevlevel ) {
2371 # reset when we step back a level
2372 $sublevelCount[$level+
1]=0;
2373 $toc .= $sk->tocUnindent( $prevlevel - $level );
2374 $toclevel -= $prevlevel - $level;
2376 # count number of headlines for each level
2377 @$sublevelCount[$level]++
;
2378 if( $doNumberHeadings ||
$doShowToc ) {
# Build the section number from the non-empty counters of levels 1..$level
2380 for( $i = 1; $i <= $level; $i++
) {
2381 if( !empty( $sublevelCount[$i] ) ) {
2385 $numbering .= $wgContLang->formatNum( $sublevelCount[$i] );
2391 # The canonized header is a version of the header text safe to use for links
2392 # Avoid insertion of weird stuff like <math> by expanding the relevant sections
# NOTE(review): the unstrip() result is overwritten by the next line, which
# starts again from $headline; the second call was probably meant to take
# $canonized_headline - confirm.
2393 $canonized_headline = $this->unstrip( $headline, $this->mStripState
);
2394 $canonized_headline = $this->unstripNoWiki( $headline, $this->mStripState
);
2396 # Remove link placeholders by the link text.
2397 # <!--LINK number-->
2399 # link text with suffix
# NOTE(review): the /e (eval) modifier was removed in PHP 7;
# preg_replace_callback() is the replacement if this code must run there.
2400 $canonized_headline = preg_replace( '/<!--LINK ([0-9]*)-->/e',
2401 "\$wgLinkHolders['texts'][\$1]",
2402 $canonized_headline );
# Strip remaining tags; the trimmed result is also the TOC entry text
2405 $canonized_headline = preg_replace( '/<.*?' . '>/','',$canonized_headline );
2406 $tocline = trim( $canonized_headline );
2407 $canonized_headline = urlencode( do_html_entity_decode( str_replace(' ', '_', $tocline), ENT_COMPAT
, $wgInputEncoding ) );
2408 $replacearray = array(
2412 $canonized_headline = str_replace(array_keys($replacearray),array_values($replacearray),$canonized_headline);
2413 $refer[$headlineCount] = $canonized_headline;
2415 # count how many in assoc. array so we can track dupes in anchors
2416 @$refers[$canonized_headline]++
;
2417 $refcount[$headlineCount]=$refers[$canonized_headline];
2419 # Prepend the number to the heading text
2421 if( $doNumberHeadings ||
$doShowToc ) {
2422 $tocline = $numbering . ' ' . $tocline;
2424 # Don't number the heading if it is the only one (looks silly)
2425 if( $doNumberHeadings && count( $matches[3] ) > 1) {
2426 # the two are different if the line contains a link
2427 $headline=$numbering . ' ' . $headline;
2431 # Create the anchor for linking from the TOC to the section
# Repeated headline texts get a "_<n>" suffix so that anchors stay unique
2432 $anchor = $canonized_headline;
2433 if($refcount[$headlineCount] > 1 ) {
2434 $anchor .= '_' . $refcount[$headlineCount];
2436 if( $doShowToc && ( !isset($wgMaxTocLevel) ||
$toclevel<$wgMaxTocLevel ) ) {
2437 $toc .= $sk->tocLine($anchor,$tocline,$toclevel);
2439 if( $showEditLink && ( !$istemplate ||
$templatetitle !== "" ) ) {
2440 if ( empty( $head[$headlineCount] ) ) {
2441 $head[$headlineCount] = '';
2444 $head[$headlineCount] .= $sk->editSectionLinkForOther($templatetitle, $templatesection);
2446 $head[$headlineCount] .= $sk->editSectionLink($sectionCount+
1);
2449 # Add the edit section span
2450 if( $rightClickHack ) {
2452 $headline = $sk->editSectionScriptForOther($templatetitle, $templatesection, $headline);
2454 $headline = $sk->editSectionScript($sectionCount+
1,$headline);
2457 # give headline the correct <h#> tag
2458 @$head[$headlineCount] .= "<a name=\"$anchor\"></a><h".$level.$matches[2][$headlineCount] .$headline.'</h'.$level.'>';
2466 $toclines = $headlineCount;
2467 $toc .= $sk->tocUnindent( $toclevel );
2468 $toc = $sk->tocTable( $toc );
2471 # split up and insert constructed headlines
2473 $blocks = preg_split( '/<H[1-6].*?' . '>.*?<\/H[1-6]>/i', $text );
2476 foreach( $blocks as $block ) {
2477 if( $showEditLink && $headlineCount > 0 && $i == 0 && $block != "\n" ) {
2478 # This is the [edit] link that appears for the top block of text when
2479 # section editing is enabled
2481 # Disabled because it broke block formatting
2482 # For example, a bullet point in the top line
2483 # $full .= $sk->editSectionLink(0);
2486 if( $doShowToc && !$i && $isMain && !$forceTocHere) {
2487 # Top anchor now in skin
2491 if( !empty( $head[$i] ) ) {
# Last visible step: the MAG_TOC magic word in the assembled text is
# replaced with the generated TOC
2497 $mw =& MagicWord
::get( MAG_TOC
);
2498 return $mw->replace( $toc, $full );
2505 * Return an HTML link for the "ISBN 123456" text
/**
 * Replace every "ISBN <number>" reference in the text with an HTML link
 * to the Booksources special page.
 *
 * The text is cut at each literal "ISBN " keyword. For every piece that
 * follows a keyword the leading spaces are collected in $blank, then the
 * longest leading run of digits, dashes and upper-case ASCII letters is
 * taken as the ISBN. If that run is empty the keyword is written back
 * unchanged; otherwise a link is emitted whose query string carries the
 * ISBN with the dashes removed.
 *
 * @param string $text text to be processed
 * @return string the text with ISBN references turned into links
 */
function magicISBN( $text ) {
	$fname = 'Parser::magicISBN';
	wfProfileIn( $fname );

	# The delimiter is a literal string, so explode() gives the same pieces
	# that the regex-based split() used to give.
	$a = explode( 'ISBN ', ' '.$text );
	if ( count ( $a ) < 2 ) {
		wfProfileOut( $fname );
		return $text;
	}
	$text = substr( array_shift( $a ), 1);
	$valid = '0123456789-ABCDEFGHIJKLMNOPQRSTUVWXYZ';

	foreach ( $a as $x ) {
		$isbn = $blank = '' ;

		# Remove leading spaces and remember them in $blank.
		# The emptiness guard stops us from probing offset 0 of an empty string.
		while ( $x != '' && ' ' == substr( $x, 0, 1 ) ) {
			$blank .= ' ';
			$x = substr( $x, 1 );
		}
		if ( $x == '' ) { # blank isbn
			$text .= "ISBN $blank";
			continue;
		}

		# Collect the ISBN itself, one valid character at a time
		while ( $x != '' && strpos( $valid, substr( $x, 0, 1 ) ) !== false ) {
			$isbn .= substr( $x, 0, 1 );
			$x = substr( $x, 1 );
		}
		$num = str_replace( '-', '', $isbn );
		$num = str_replace( ' ', '', $num );

		if ( '' == $num ) {
			# Nothing usable followed the keyword: put the original text back
			$text .= "ISBN $blank$x";
		} else {
			$titleObj = Title::makeTitle( NS_SPECIAL, 'Booksources' );
			$text .= '<a href="' .
				$titleObj->escapeLocalUrl( 'isbn='.$num ) .
				"\" class=\"internal\">ISBN $isbn</a>";
			$text .= $x;
		}
	}
	wfProfileOut( $fname );
	return $text;
}
2553 * Return an HTML link for the "GEO ..." text
/**
 * Replace every "GEO <coordinates>" reference in the text with an HTML
 * link to the Geo special page.
 *
 * Coordinates written as  D°M'S" North|South, D°M'S" West|East  are first
 * rewritten to the compact form "(GEO <sign>D.M.S:<sign>D.M.S)". South and
 * West get a minus sign, North and East a plus sign. The text is then cut
 * at each literal "GEO " keyword; leading spaces are collected in $blank
 * and the longest leading run of digits, '.', '+', '-' and ':' is taken as
 * the coordinate string. A link is emitted only if that string, with the
 * plus signs removed, is non-empty and contains at least one ':'.
 *
 * @param string $text text to be processed
 * @return string the text with GEO references turned into links
 */
function magicGEO( $text ) {
	global $wgLang, $wgUseGeoMode;
	$fname = 'Parser::magicGEO';
	wfProfileIn( $fname );

	# This rewrite is only for the ~35000 U.S. Census Rambot pages...
	# Southern latitudes must be negative, just like western longitudes.
	$latitudes = array( 'North' => '+', 'South' => '-' );
	$longitudes = array( 'West' => '-', 'East' => '+' );
	$dms = "(\\d+)°(\\d+)'(\\d+)\"";
	foreach ( $latitudes as $latName => $latSign ) {
		foreach ( $longitudes as $lonName => $lonSign ) {
			$text = preg_replace(
				"/{$dms} {$latName}, {$dms} {$lonName}/",
				"(GEO {$latSign}\$1.\$2.\$3:{$lonSign}\$4.\$5.\$6)",
				$text );
		}
	}

	# The delimiter is a literal string, so explode() gives the same pieces
	# that the regex-based split() used to give.
	$a = explode( 'GEO ', ' '.$text );
	if ( count ( $a ) < 2 ) {
		wfProfileOut( $fname );
		return $text;
	}
	$text = substr( array_shift( $a ), 1);
	$valid = '0123456789.+-:';

	foreach ( $a as $x ) {
		$geo = $blank = '' ;

		# Remove leading spaces and remember them in $blank.
		# The emptiness guard stops us from probing offset 0 of an empty string.
		while ( $x != '' && ' ' == substr( $x, 0, 1 ) ) {
			$blank .= ' ';
			$x = substr( $x, 1 );
		}

		# Collect the coordinate string, one valid character at a time
		while ( $x != '' && strpos( $valid, substr( $x, 0, 1 ) ) !== false ) {
			$geo .= substr( $x, 0, 1 );
			$x = substr( $x, 1 );
		}
		$num = str_replace( '+', '', $geo );
		$num = str_replace( ' ', '', $num );

		if ( '' == $num || count ( explode ( ':' , $num , 3 ) ) < 2 ) {
			# Not a latitude:longitude pair: put the original text back
			$text .= "GEO $blank$x";
		} else {
			$titleObj = Title::makeTitle( NS_SPECIAL, 'Geo' );
			$text .= '<a href="' .
				$titleObj->escapeLocalUrl( 'coordinates='.$num ) .
				"\" class=\"internal\">GEO $geo</a>";
			$text .= $x;
		}
	}
	wfProfileOut( $fname );
	return $text;
}
2604 * Return an HTML link for the "RFC 1234" text
2606 * @param string $text text to be processed
/**
 * Return an HTML link for the "RFC 1234" text
 *
 * The text is cut at each literal "RFC " keyword. For every piece that
 * follows a keyword the leading spaces are collected in $blank and the
 * leading run of digits is taken as the RFC number. Three outcomes are
 * possible for each keyword:
 *  - no number follows: the keyword and the text are written back as is;
 *  - the keyword is directly preceded by "[[": it is part of an internal
 *    link and is written back without an external link;
 *  - otherwise an external link is built from the 'rfcurl' message, with
 *    '$1' replaced by the RFC number.
 *
 * @param string $text text to be processed
 * @return string the processed text
 */
function magicRFC( $text ) {
	$valid = '0123456789';
	$internal = false;

	# The delimiter is a literal string, so explode() gives the same pieces
	# that the regex-based split() used to give.
	$a = explode( 'RFC ', ' '.$text );
	if ( count ( $a ) < 2 ) return $text;
	$text = substr( array_shift( $a ), 1);

	/* Check if the RFC keyword is preceded by [[.
	 * This test is made here because the array_shift above
	 * prevents it from being done inside the foreach.
	 */
	if(substr($text, -2) == '[[') { $internal = true; }

	foreach ( $a as $x ) {
		/* token might be empty if we have RFC RFC 1234 */
		if ( $x == '' ) {
			$text .= 'RFC ';
			continue;
		}

		$rfc = $blank = '' ;

		/** remove and save whitespaces in $blank */
		while ( $x != '' && substr( $x, 0, 1 ) == ' ' ) {
			$blank .= ' ';
			$x = substr( $x, 1 );
		}

		/** remove and save the rfc number in $rfc */
		while ( $x != '' && strpos( $valid, substr( $x, 0, 1 ) ) !== false ) {
			$rfc .= substr( $x, 0, 1 );
			$x = substr( $x, 1 );
		}

		if ( $rfc == '' ) {
			/* call back stripped spaces*/
			$text .= "RFC $blank$x";
		} elseif( $internal) {
			/* inside [[...]]: leave it to the internal link handling */
			$text .= "RFC $rfc$x";
		} else {
			/* build the external link*/
			$url = wfmsg( 'rfcurl' );
			$url = str_replace( '$1', $rfc, $url);
			$sk =& $this->mOptions->getSkin();
			$la = $sk->getExternalLinkAttributes( $url, 'RFC '.$rfc );
			$text .= "<a href='{$url}'{$la}>RFC {$rfc}</a>{$x}";
		}

		/* Check if the next RFC keyword is preceded by [[ */
		$internal = (substr($x,-2) == '[[');
	}
	return $text;
}
2667 * Transform wiki markup when saving a page by doing \r\n -> \n
2668 * conversion, substituting signatures, {{subst:}} templates, etc.
2670 * @param string $text the text to transform
2671 * @param Title &$title the Title object for the current article
2672 * @param User &$user the User object describing the current user
2673 * @param ParserOptions $options parsing options
2674 * @param bool $clearState whether to clear the parser state first
2675 * @return string the altered wiki markup
2678 function preSaveTransform( $text, &$title, &$user, $options, $clearState = true ) {
2679 $this->mOptions
= $options;
2680 $this->mTitle
=& $title;
2681 $this->mOutputType
= OT_WIKI
;
2683 if ( $clearState ) {
2684 $this->clearState();
2687 $stripState = false;
2691 $text = str_replace(array_keys($pairs), array_values($pairs), $text);
2695 "/<br.+(clear|break)=[\"']?(all|both)[\"']?\\/?>/i" => '<br style="clear:both;"/>',
2696 "/<br *?>/i" => "<br />",
2698 $text = preg_replace(array_keys($pairs), array_values($pairs), $text);
2700 $text = $this->strip( $text, $stripState, false );
2701 $text = $this->pstPass2( $text, $user );
2702 $text = $this->unstrip( $text, $stripState );
2703 $text = $this->unstripNoWiki( $text, $stripState );
2708 * Pre-save transform helper function
/**
 * Pre-save transform helper function
 *
 * Works on text whose strippable sections have already been removed by
 * the caller and performs, in this order:
 *  - variable replacement through replaceVariables();
 *  - signature expansion: ~~~~~ becomes the timestamp, ~~~~ the user
 *    link followed by the timestamp, ~~~ the user link alone;
 *  - context ("pipe trick") link expansion for [[|name]],
 *    [[name (context)|]], [[ns:name|]] and [[ns:name (context)|]];
 *  - trimming of trailing whitespace and removal of the MAG_END marker.
 *
 * @param string $text the text to transform
 * @param User &$user the user whose name and nickname are used for signatures
 * @return string the transformed text
 */
function pstPass2( $text, &$user ) {
	global $wgLang, $wgContLang, $wgLocaltimezone, $wgCurParser;

	# Variable replacement
	# Because mOutputType is OT_WIKI, this will only process {{subst:xxx}} type tags
	$text = $this->replaceVariables( $text );

	# Signatures
	#
	$n = $user->getName();
	$k = $user->getOption( 'nickname' );
	if ( '' == $k ) { $k = $n; }
	if(isset($wgLocaltimezone)) {
		# Remember the current TZ so it can be put back once the date is formatted
		$oldtz = getenv('TZ'); putenv('TZ='.$wgLocaltimezone);
	}
	/* Note: this is an ugly timezone hack for the European wikis */
	$d = $wgContLang->timeanddate( date( 'YmdHis' ), false ) .
		' (' . date( 'T' ) . ')';
	# Restore the value saved above (it used to read an undefined variable)
	if(isset($wgLocaltimezone)) putenv('TZ='.$oldtz);

	# Longest tilde run first, so that ~~~~~ is not eaten by the shorter patterns
	$text = preg_replace( '/~~~~~/', $d, $text );
	$text = preg_replace( '/~~~~/', '[[' . $wgContLang->getNsText( NS_USER ) . ":$n|$k]] $d", $text );
	$text = preg_replace( '/~~~/', '[[' . $wgContLang->getNsText( NS_USER ) . ":$n|$k]]", $text );

	# Context links: [[|name]] and [[name (context)|]]
	#
	$tc = "[&;%\\-,.\\(\\)' _0-9A-Za-z\\/:\\x80-\\xff]";
	$np = "[&;%\\-,.' _0-9A-Za-z\\/:\\x80-\\xff]"; # No parens
	$namespacechar = '[ _0-9A-Za-z\x80-\xff]'; # Namespaces can use non-ascii!
	$conpat = "/^({$np}+) \\(({$tc}+)\\)$/";

	$p1 = "/\[\[({$np}+) \\(({$np}+)\\)\\|]]/"; # [[page (context)|]]
	$p2 = "/\[\[\\|({$tc}+)]]/"; # [[|page]]
	$p3 = "/\[\[(:*$namespacechar+):({$np}+)\\|]]/"; # [[namespace:page|]] and [[:namespace:page|]]
	$p4 = "/\[\[(:*$namespacechar+):({$np}+) \\(({$np}+)\\)\\|]]/"; # [[ns:page (cont)|]] and [[:ns:page (cont)|]]

	# The context is the parenthesised suffix of the current page title, if any
	$context = '';
	$t = $this->mTitle->getText();
	if ( preg_match( $conpat, $t, $m ) ) {
		$context = $m[2];
	}
	$text = preg_replace( $p4, '[[\\1:\\2 (\\3)|\\2]]', $text );
	$text = preg_replace( $p1, '[[\\1 (\\2)|\\1]]', $text );
	$text = preg_replace( $p3, '[[\\1:\\2|\\2]]', $text );

	if ( '' == $context ) {
		$text = preg_replace( $p2, '[[\\1]]', $text );
	} else {
		$text = preg_replace( $p2, "[[\\1 ({$context})|\\1]]", $text );
	}

	# Trim trailing whitespace
	# MAG_END (__END__) tag allows for trailing
	# whitespace to be deliberately included
	$text = rtrim( $text );
	$mw =& MagicWord::get( MAG_END );
	$mw->matchAndRemove( $text );

	return $text;
}
2772 * Set up some variables which are usually set up in parse()
2773 * so that an external function can call some class members with confidence
/**
 * Prepare the parser for use by an external caller: store the title,
 * the options and the output type, and optionally reset the parser state.
 *
 * @param Title &$title the title to parse against (stored by reference)
 * @param ParserOptions $options parsing options
 * @param int $outputType stored as the parser's output type
 * @param bool $clearState whether to call clearState() afterwards
 */
function startExternalParse( &$title, $options, $outputType, $clearState = true ) {
	$this->mOutputType = $outputType;
	$this->mOptions = $options;
	$this->mTitle =& $title;

	if ( !$clearState ) {
		return;
	}
	$this->clearState();
}
2786 * Transform a MediaWiki message by replacing magic variables.
2788 * @param string $text the text to transform
2789 * @param ParserOptions $options options
2790 * @return string the text with variables substituted
2793 function transformMsg( $text, $options ) {
2795 static $executing = false;
2797 # Guard against infinite recursion
2803 $this->mTitle
= $wgTitle;
2804 $this->mOptions
= $options;
2805 $this->mOutputType
= OT_MSG
;
2806 $this->clearState();
2807 $text = $this->replaceVariables( $text );
2814 * Create an HTML-style tag, e.g. <yourtag>special text</yourtag>
2815 * Callback will be called with the text within
2816 * Transform and return the text within
2819 function setHook( $tag, $callback ) {
2820 $oldVal = @$this->mTagHooks
[$tag];
2821 $this->mTagHooks
[$tag] = $callback;
2828 * @package MediaWiki
2832 var $mText, $mLanguageLinks, $mCategoryLinks, $mContainsOldMagic;
2833 var $mCacheTime; # Used in ParserCache
/**
 * Constructor: store the given values in the corresponding members
 * and start with an empty cache time.
 *
 * @param string $text stored in mText
 * @param array $languageLinks stored in mLanguageLinks
 * @param array $categoryLinks stored in mCategoryLinks
 * @param bool $containsOldMagic stored in mContainsOldMagic
 */
function ParserOutput(
	$text = '',
	$languageLinks = array(),
	$categoryLinks = array(),
	$containsOldMagic = false
) {
	$this->mCacheTime = '';
	$this->mContainsOldMagic = $containsOldMagic;
	$this->mCategoryLinks = $categoryLinks;
	$this->mLanguageLinks = $languageLinks;
	$this->mText = $text;
}
/** Accessor for the mText member */
function getText() {
	return $this->mText;
}

/** Accessor for the mLanguageLinks member */
function getLanguageLinks() {
	return $this->mLanguageLinks;
}

/** Accessor for the mCategoryLinks member */
function getCategoryLinks() {
	return $this->mCategoryLinks;
}

/** Accessor for the mCacheTime member */
function getCacheTime() {
	return $this->mCacheTime;
}

/** Accessor for the mContainsOldMagic member */
function containsOldMagic() {
	return $this->mContainsOldMagic;
}
# Each setter hands its member and the new value to wfSetVar() and
# passes wfSetVar()'s result back to the caller.

/** Set mText through wfSetVar() */
function setText( $text ) {
	$result = wfSetVar( $this->mText, $text );
	return $result;
}

/** Set mLanguageLinks through wfSetVar() */
function setLanguageLinks( $ll ) {
	$result = wfSetVar( $this->mLanguageLinks, $ll );
	return $result;
}

/** Set mCategoryLinks through wfSetVar() */
function setCategoryLinks( $cl ) {
	$result = wfSetVar( $this->mCategoryLinks, $cl );
	return $result;
}

/** Set mContainsOldMagic through wfSetVar() */
function setContainsOldMagic( $com ) {
	$result = wfSetVar( $this->mContainsOldMagic, $com );
	return $result;
}

/** Set mCacheTime through wfSetVar() */
function setCacheTime( $t ) {
	$result = wfSetVar( $this->mCacheTime, $t );
	return $result;
}
/**
 * Fold another output object into this one.
 *
 * The language links and the category links of $other are appended to
 * the corresponding lists of this object, and the "contains old magic"
 * flag becomes true if it is set on either object. mText and mCacheTime
 * are left untouched.
 *
 * @param ParserOutput $other the object whose links and flag are merged in
 */
function merge( $other ) {
	$this->mLanguageLinks = array_merge( $this->mLanguageLinks, $other->mLanguageLinks );
	# Take the categories from $other; merging in our own language links
	# here would pollute the category list and lose $other's categories.
	$this->mCategoryLinks = array_merge( $this->mCategoryLinks, $other->mCategoryLinks );
	$this->mContainsOldMagic = $this->mContainsOldMagic || $other->mContainsOldMagic;
}
2865 * Set options of the Parser
2867 * @package MediaWiki
2871 # All variables are private
2872 var $mUseTeX; # Use texvc to expand <math> tags
2873 var $mUseDynamicDates; # Use $wgDateFormatter to format dates
2874 var $mInterwikiMagic; # Interlanguage links are removed and returned in an array
2875 var $mAllowExternalImages; # Allow external images inline
2876 var $mSkin; # Reference to the preferred skin
2877 var $mDateFormat; # Date format index
2878 var $mEditSection; # Create "edit section" links
2879 var $mEditSectionOnRightClick; # Generate JavaScript to edit section on right click
2880 var $mNumberHeadings; # Automatically number headings
2881 var $mShowToc; # Show table of contents
/** Accessor for the mUseTeX member */
function getUseTeX() {
	return $this->mUseTeX;
}

/** Accessor for the mUseDynamicDates member */
function getUseDynamicDates() {
	return $this->mUseDynamicDates;
}

/** Accessor for the mInterwikiMagic member */
function getInterwikiMagic() {
	return $this->mInterwikiMagic;
}

/** Accessor for the mAllowExternalImages member */
function getAllowExternalImages() {
	return $this->mAllowExternalImages;
}

/** Accessor for the mSkin member */
function getSkin() {
	return $this->mSkin;
}

/** Accessor for the mDateFormat member */
function getDateFormat() {
	return $this->mDateFormat;
}

/** Accessor for the mEditSection member */
function getEditSection() {
	return $this->mEditSection;
}

/** Accessor for the mEditSectionOnRightClick member */
function getEditSectionOnRightClick() {
	return $this->mEditSectionOnRightClick;
}

/** Accessor for the mNumberHeadings member */
function getNumberHeadings() {
	return $this->mNumberHeadings;
}

/** Accessor for the mShowToc member */
function getShowToc() {
	return $this->mShowToc;
}
# Each setter hands its member and the new value to wfSetVar() and
# passes wfSetVar()'s result back to the caller.

/** Set mUseTeX through wfSetVar() */
function setUseTeX( $x ) {
	$result = wfSetVar( $this->mUseTeX, $x );
	return $result;
}

/** Set mUseDynamicDates through wfSetVar() */
function setUseDynamicDates( $x ) {
	$result = wfSetVar( $this->mUseDynamicDates, $x );
	return $result;
}

/** Set mInterwikiMagic through wfSetVar() */
function setInterwikiMagic( $x ) {
	$result = wfSetVar( $this->mInterwikiMagic, $x );
	return $result;
}

/** Set mAllowExternalImages through wfSetVar() */
function setAllowExternalImages( $x ) {
	$result = wfSetVar( $this->mAllowExternalImages, $x );
	return $result;
}

/** Set mDateFormat through wfSetVar() */
function setDateFormat( $x ) {
	$result = wfSetVar( $this->mDateFormat, $x );
	return $result;
}

/** Set mEditSection through wfSetVar() */
function setEditSection( $x ) {
	$result = wfSetVar( $this->mEditSection, $x );
	return $result;
}

/** Set mEditSectionOnRightClick through wfSetVar() */
function setEditSectionOnRightClick( $x ) {
	$result = wfSetVar( $this->mEditSectionOnRightClick, $x );
	return $result;
}

/** Set mNumberHeadings through wfSetVar() */
function setNumberHeadings( $x ) {
	$result = wfSetVar( $this->mNumberHeadings, $x );
	return $result;
}

/** Set mShowToc through wfSetVar() */
function setShowToc( $x ) {
	$result = wfSetVar( $this->mShowToc, $x );
	return $result;
}
/**
 * Bind mSkin to the caller's skin variable. The assignment is by
 * reference, so no copy is made, and nothing is returned.
 *
 * @param Skin &$x the skin to use
 */
function setSkin( &$x ) {
	$this->mSkin =& $x;
}
2906 # Get parser options
2907 /* static */ function newFromUser( &$user ) {
2908 $popts = new ParserOptions
;
2909 $popts->initialiseFromUser( $user );
2914 function initialiseFromUser( &$userInput ) {
2915 global $wgUseTeX, $wgUseDynamicDates, $wgInterwikiMagic, $wgAllowExternalImages;
2917 $fname = 'ParserOptions::initialiseFromUser';
2918 wfProfileIn( $fname );
2919 if ( !$userInput ) {
2921 $user->setLoaded( true );
2923 $user =& $userInput;
2926 $this->mUseTeX
= $wgUseTeX;
2927 $this->mUseDynamicDates
= $wgUseDynamicDates;
2928 $this->mInterwikiMagic
= $wgInterwikiMagic;
2929 $this->mAllowExternalImages
= $wgAllowExternalImages;
2930 wfProfileIn( $fname.'-skin' );
2931 $this->mSkin
=& $user->getSkin();
2932 wfProfileOut( $fname.'-skin' );
2933 $this->mDateFormat
= $user->getOption( 'date' );
2934 $this->mEditSection
= $user->getOption( 'editsection' );
2935 $this->mEditSectionOnRightClick
= $user->getOption( 'editsectiononrightclick' );
2936 $this->mNumberHeadings
= $user->getOption( 'numberheadings' );
2937 $this->mShowToc
= $user->getOption( 'showtoc' );
2938 wfProfileOut( $fname );
2944 # Regex callbacks, used in Parser::replaceVariables
/**
 * Regex callback: forward the match array to the braceSubstitution()
 * method of the parser held in the global $wgCurParser.
 *
 * @param array $matches the match array handed over by the regex function
 * @return mixed whatever braceSubstitution() returns
 */
function wfBraceSubstitution( $matches ) {
	return $GLOBALS['wgCurParser']->braceSubstitution( $matches );
}
/**
 * Regex callback: forward the match array to the argSubstitution()
 * method of the parser held in the global $wgCurParser.
 *
 * @param array $matches the match array handed over by the regex function
 * @return mixed whatever argSubstitution() returns
 */
function wfArgSubstitution( $matches ) {
	return $GLOBALS['wgCurParser']->argSubstitution( $matches );
}
/**
 * Regex callback: forward the match array to the variableSubstitution()
 * method of the parser held in the global $wgCurParser.
 *
 * @param array $matches the match array handed over by the regex function
 * @return mixed whatever variableSubstitution() returns
 */
function wfVariableSubstitution( $matches ) {
	return $GLOBALS['wgCurParser']->variableSubstitution( $matches );
}
2961 * Return the total number of articles
2963 function wfNumberOfArticles() {
2964 global $wgNumberOfArticles;
2967 return $wgNumberOfArticles;
2971 * Get various statistics from the database
2974 function wfLoadSiteStats() {
2975 global $wgNumberOfArticles, $wgTotalViews, $wgTotalEdits;
2976 $fname = 'wfLoadSiteStats';
2978 if ( -1 != $wgNumberOfArticles ) return;
2979 $dbr =& wfGetDB( DB_SLAVE
);
2980 $s = $dbr->getArray( 'site_stats',
2981 array( 'ss_total_views', 'ss_total_edits', 'ss_good_articles' ),
2982 array( 'ss_row_id' => 1 ), $fname
2985 if ( $s === false ) {
2988 $wgTotalViews = $s->ss_total_views
;
2989 $wgTotalEdits = $s->ss_total_edits
;
2990 $wgNumberOfArticles = $s->ss_good_articles
;
2994 function wfEscapeHTMLTagsOnly( $in ) {
2996 array( '"', '>', '<' ),
2997 array( '"', '>', '<' ),