From 6d41132f82ddb2c118780a510d392d1b90c8d7c9 Mon Sep 17 00:00:00 2001 From: Gabriel Wicke Date: Wed, 26 May 2004 16:29:04 +0000 Subject: [PATCH] temporary removal of tokenizer for performance To be done: * integrate timelines into strip() / unstrip() * 'blank' token handling (french spaces, maybe numbers) --- includes/Parser.php | 589 ++++++++++++++------------------------------ 1 file changed, 185 insertions(+), 404 deletions(-) diff --git a/includes/Parser.php b/includes/Parser.php index 3a4b666a1d..6b8c27502d 100644 --- a/includes/Parser.php +++ b/includes/Parser.php @@ -1,6 +1,6 @@ removeHTMLtags( $text ); $text = $this->replaceVariables( $text, $args ); - # $text = preg_replace( "/(^|\n)-----*/", "\\1
", $text ); + $text = preg_replace( "/(^|\n)-----*/", "\\1
", $text ); $text = $this->doHeadings( $text ); if($this->mOptions->getUseDynamicDates()) { global $wgDateFormatter; $text = $wgDateFormatter->reformat( $this->mOptions->getDateFormat(), $text ); } + $text = $this->doAllQuotes( $text ); $text = $this->replaceExternalLinks( $text ); - $text = $this->doTokenizedParser ( $text ); + $text = $this->replaceInternalLinks ( $text ); + //$text = $this->doTokenizedParser ( $text ); $text = $this->doTableStuff ( $text ) ; + $text = $this->magicISBN( $text ); + $text = $this->magicRFC( $text ); $text = $this->formatHeadings( $text, $isMain ); $sk =& $this->mOptions->getSkin(); $text = $sk->transformContent( $text ); @@ -590,6 +594,64 @@ class Parser return $text; } + /* private */ function doAllQuotes( $text ) + { + $outtext = ""; + $lines = explode( "\r\n", $text ); + foreach ( $lines as $line ) { + $outtext .= $this->doQuotes ( "", $line, "" ) . "\r\n"; + } + return $outtext; + } + + /* private */ function doQuotes( $pre, $text, $mode ) + { + if ( preg_match( "/^(.*)''(.*)$/sU", $text, $m ) ) { + $m1_strong = ($m[1] == "") ? "" : "{$m[1]}"; + $m1_em = ($m[1] == "") ? "" : "{$m[1]}"; + if ( substr ($m[2], 0, 1) == "'" ) { + $m[2] = substr ($m[2], 1); + if ($mode == "em") { + return $this->doQuotes ( $m[1], $m[2], ($m[1] == "") ? "both" : "emstrong" ); + } else if ($mode == "strong") { + return $m1_strong . $this->doQuotes ( "", $m[2], "" ); + } else if (($mode == "emstrong") || ($mode == "both")) { + return $this->doQuotes ( "", $pre.$m1_strong.$m[2], "em" ); + } else if ($mode == "strongem") { + return "{$pre}{$m1_em}" . $this->doQuotes ( "", $m[2], "em" ); + } else { + return $m[1] . $this->doQuotes ( "", $m[2], "strong" ); + } + } else { + if ($mode == "strong") { + return $this->doQuotes ( $m[1], $m[2], ($m[1] == "") ? "both" : "strongem" ); + } else if ($mode == "em") { + return $m1_em . $this->doQuotes ( "", $m[2], "" ); + } else if ($mode == "emstrong") { + return "{$pre}{$m1_strong}" . $this->doQuotes ( "", $m[2], "strong" ); + } else if (($mode == "strongem") || ($mode == "both")) { + return $this->doQuotes ( "", $pre.$m1_em.$m[2], "strong" ); + } else { + return $m[1] . $this->doQuotes ( "", $m[2], "em" ); + } + } + } else { + $text_strong = ($text == "") ? "" : "{$text}"; + $text_em = ($text == "") ? "" : "{$text}"; + if ($mode == "") { + return $pre . $text; + } else if ($mode == "em") { + return $pre . $text_em; + } else if ($mode == "strong") { + return $pre . $text_strong; + } else if ($mode == "strongem") { + return (($pre == "") && ($text == "")) ? "" : "{$pre}{$text_em}"; + } else { + return (($pre == "") && ($text == "")) ? "" : "{$pre}{$text_strong}"; + } + } + } + # Note: we have to do external links before the internal ones, # and otherwise take great care in the order of things here, so # that we don't end up interpreting some URLs twice. @@ -676,284 +738,12 @@ class Parser return $s; } - /* private */ function handle4Quotes( &$state, $token ) - { - /* This one makes some assumptions. - * '''Caesar''''s army => Caesar's army - * ''''Caesar'''' was a roman emperor => 'Caesar' was a roman emperor - * These assumptions might be wrong, but any other assumption might be wrong, too. - * So here we go */ - if ( $state["strong"] !== false ) { - return $this->handle3Quotes( $state, $token ) . "'"; - } else { - return "'" . $this->handle3Quotes( $state, $token ); - } - } - - - /* private */ function handle3Quotes( &$state, $token ) - { - if ( $state["strong"] !== false ) { - if ( $state["em"] !== false && $state["em"] > $state["strong"] ) - { - # ''' lala ''lala ''' - $s = ""; - } else { - $s = ""; - } - $state["strong"] = FALSE; - } else { - $s = ""; - $state["strong"] = $token["pos"]; - } - return $s; - } - /* private */ function handle2Quotes( &$state, $token ) - { - if ( $state["em"] !== false ) { - if ( $state["strong"] !== false && $state["strong"] > $state["em"] ) - { - # ''lala'''lala'' ....''' - $s = ""; - } else { - $s = ""; - } - $state["em"] = FALSE; - } else { - $s = ""; - $state["em"] = $token["pos"]; - - } - return $s; - } - - /* private */ function handle5Quotes( &$state, $token ) - { - $s = ""; - if ( $state["em"] !== false && $state["strong"] !== false ) { - if ( $state["em"] < $state["strong"] ) { - $s .= ""; - } else { - $s .= ""; - } - $state["strong"] = $state["em"] = FALSE; - } elseif ( $state["em"] !== false ) { - $s .= ""; - $state["em"] = FALSE; - $state["strong"] = $token["pos"]; - } elseif ( $state["strong"] !== false ) { - $s .= ""; - $state["strong"] = FALSE; - $state["em"] = $token["pos"]; - } else { # not $em and not $strong - $s .= ""; - $state["strong"] = $state["em"] = $token["pos"]; - } - return $s; - } - - /* private */ function doTokenizedParser( $str ) - { - global $wgLang; # for language specific parser hook - global $wgUploadDirectory, $wgUseTimeline; - - $tokenizer=Tokenizer::newFromString( $str ); - $tokenStack = array(); - - $s=""; - $state["em"] = FALSE; - $state["strong"] = FALSE; - $tagIsOpen = FALSE; - $threeopen = false; - - # The tokenizer splits the text into tokens and returns them one by one. - # Every call to the tokenizer returns a new token. - while ( $token = $tokenizer->nextToken() ) - { - switch ( $token["type"] ) - { - case "text": - # simple text with no further markup - $txt = $token["text"]; - break; - case "blank": - # Text that contains blanks that have to be converted to - # non-breakable spaces for French. - # U+202F NARROW NO-BREAK SPACE might be a better choice, but - # browser support for Unicode spacing is poor. - $txt = str_replace( " ", " ", $token["text"] ); - break; - case "[[[": - # remember the tag opened with 3 [ - $threeopen = true; - case "[[": - # link opening tag. - # FIXME : Treat orphaned open tags (stack not empty when text is over) - $tagIsOpen = TRUE; - array_push( $tokenStack, $token ); - $txt=""; - break; - - case "]]]": - case "]]": - # link close tag. - # get text from stack, glue it together, and call the code to handle a - # link - - if ( count( $tokenStack ) == 0 ) - { - # stack empty. Found a ]] without an opening [[ - $txt = "]]"; - } else { - $linkText = ""; - $lastToken = array_pop( $tokenStack ); - while ( !(($lastToken["type"] == "[[[") or ($lastToken["type"] == "[[")) ) - { - if( !empty( $lastToken["text"] ) ) { - $linkText = $lastToken["text"] . $linkText; - } - $lastToken = array_pop( $tokenStack ); - } - - $txt = $linkText ."]]"; - - if( isset( $lastToken["text"] ) ) { - $prefix = $lastToken["text"]; - } else { - $prefix = ""; - } - $nextToken = $tokenizer->previewToken(); - if ( $nextToken["type"] == "text" ) - { - # Preview just looks at it. Now we have to fetch it. - $nextToken = $tokenizer->nextToken(); - $txt .= $nextToken["text"]; - } - $txt = $this->handleInternalLink( $this->unstrip($txt,$this->mStripState), $prefix ); - - # did the tag start with 3 [ ? - if($threeopen) { - # show the first as text - $txt = "[".$txt; - $threeopen=false; - } - - } - $tagIsOpen = (count( $tokenStack ) != 0); - break; - case "----": - $txt = "\n
\n"; - break; - case "'''": - # This and the four next ones handle quotes - $txt = $this->handle3Quotes( $state, $token ); - break; - case "''": - $txt = $this->handle2Quotes( $state, $token ); - break; - case "'''''": - $txt = $this->handle5Quotes( $state, $token ); - break; - case "''''": - $txt = $this->handle4Quotes( $state, $token ); - break; - case "": - # empty token - $txt=""; - break; - case "h": - #heading- used to close all unbalanced bold or em tags in this section - $txt = ''; - if( $state['em'] !== false and - ( $state['strong'] === false or $state['em'] > $state['strong'] ) ) - { - $s .= '
'; - $state['em'] = false; - } - if ( $state['strong'] !== false ) $txt .= '
'; - if ( $state['em'] !== false ) $txt .= '
'; - $state['strong'] = $state['em'] = false; - break; - case "RFC ": - if ( $tagIsOpen ) { - $txt = "RFC "; - } else { - $txt = $this->doMagicRFC( $tokenizer ); - } - break; - case "ISBN ": - if ( $tagIsOpen ) { - $txt = "ISBN "; - } else { - $txt = $this->doMagicISBN( $tokenizer ); - } - break; - case "": - if ( $wgUseTimeline && - "" != ( $timelinesrc = $tokenizer->readAllUntil("</timeline>") ) ) - { - $txt = renderTimeline( $timelinesrc ); - } else { - $txt=$token["text"]; - } - break; - default: - # Call language specific Hook. - $txt = $wgLang->processToken( $token, $tokenStack ); - if ( NULL == $txt ) { - # An unkown token. Highlight. - $txt = "".$token["type"].""; - $txt .= "".$token["text"].""; - } - break; - } - # If we're parsing the interior of a link, don't append the interior to $s, - # but push it to the stack so it can be processed when a ]] token is found. - if ( $tagIsOpen && $txt != "" ) { - $token["type"] = "text"; - $token["text"] = $txt; - array_push( $tokenStack, $token ); - } else { - $s .= $txt; - } - } #end while - - # make 100% sure all strong and em tags are closed - # doBlockLevels often messes the last bit up though, but invalid nesting is better than unclosed tags - # tidy solves this though - if( $state['em'] !== false and - ( $state['strong'] === false or $state['em'] > $state['strong'] ) ) - { - $s .= ''; - $state['em'] = false; - } - if ( $state['strong'] !== false ) $s .= ''; - if ( $state['em'] !== false ) $s .= ''; - - if ( count( $tokenStack ) != 0 ) - { - # still objects on stack. opened [[ tag without closing ]] tag. - $txt = ""; - while ( $lastToken = array_pop( $tokenStack ) ) - { - if ( $lastToken["type"] == "text" ) - { - $txt = $lastToken["text"] . $txt; - } else { - $txt = $lastToken["type"] . $txt; - } - } - $s .= $txt; - } - return $s; - } - - /* private */ function handleInternalLink( $line, $prefix ) + /* private */ function replaceInternalLinks( $s ) { global $wgLang, $wgLinkCache; global $wgNamespacesWithSubpages, $wgLanguageCode; - static $fname = "Parser::handleInternalLink" ; + static $fname = "Parser::replaceInternalLink" ; wfProfileIn( $fname ); wfProfileIn( "$fname-setup" ); @@ -962,6 +752,10 @@ class Parser if ( !$tc ) { $tc = Title::legalChars() . "#%"; } $sk =& $this->mOptions->getSkin(); + $a = explode( "[[", " " . $s ); + $s = array_shift( $a ); + $s = substr( $s, 1 ); + # Match a link having the form [[namespace:link|alternate]]trail static $e1 = FALSE; if ( !$e1 ) { $e1 = "/^([{$tc}]+)(?:\\|([^]]+))?]](.*)\$/sD"; } @@ -984,109 +778,118 @@ class Parser $nottalk = !Namespace::isTalk( $this->mTitle->getNamespace() ); + if ( $wgLang->linkPrefixExtension() && preg_match( $e2, $s, $m ) ) { + $new_prefix = $m[2]; + $s = $m[1]; + } else { + $new_prefix=""; + } + wfProfileOut( "$fname-setup" ); - $s = ""; - - if ( preg_match( $e1, $line, $m ) ) { # page with normal text or alt - $text = $m[2]; - # fix up urlencoded title texts - if(preg_match("/%/", $m[1] )) $m[1] = urldecode($m[1]); - $trail = $m[3]; - } else { # Invalid form; output directly - $s .= $prefix . "[[" . $line ; - wfProfileOut( $fname ); - return $s; - } - - /* Valid link forms: - Foobar -- normal - :Foobar -- override special treatment of prefix (images, language links) - /Foobar -- convert to CurrentPage/Foobar - /Foobar/ -- convert to CurrentPage/Foobar, strip the initial / from text - */ - $c = substr($m[1],0,1); - $noforce = ($c != ":"); - if( $c == "/" ) { # subpage - if(substr($m[1],-1,1)=="/") { # / at end means we don't want the slash to be shown - $m[1]=substr($m[1],1,strlen($m[1])-2); - $noslash=$m[1]; - } else { - $noslash=substr($m[1],1); + + foreach ( $a as $line ) { + $prefix = $new_prefix; + + if ( preg_match( $e1, $line, $m ) ) { # page with normal text or alt + $text = $m[2]; + # fix up urlencoded title texts + if(preg_match("/%/", $m[1] )) $m[1] = urldecode($m[1]); + $trail = $m[3]; + } else { # Invalid form; output directly + $s .= $prefix . "[[" . $line ; + wfProfileOut( $fname ); + continue; } - if(!empty($wgNamespacesWithSubpages[$this->mTitle->getNamespace()])) { # subpages allowed here - $link = $this->mTitle->getPrefixedText(). "/" . trim($noslash); - if( "" == $text ) { - $text= $m[1]; - } # this might be changed for ugliness reasons + + /* Valid link forms: + Foobar -- normal + :Foobar -- override special treatment of prefix (images, language links) + /Foobar -- convert to CurrentPage/Foobar + /Foobar/ -- convert to CurrentPage/Foobar, strip the initial / from text + */ + $c = substr($m[1],0,1); + $noforce = ($c != ":"); + if( $c == "/" ) { # subpage + if(substr($m[1],-1,1)=="/") { # / at end means we don't want the slash to be shown + $m[1]=substr($m[1],1,strlen($m[1])-2); + $noslash=$m[1]; + } else { + $noslash=substr($m[1],1); + } + if(!empty($wgNamespacesWithSubpages[$this->mTitle->getNamespace()])) { # subpages allowed here + $link = $this->mTitle->getPrefixedText(). "/" . trim($noslash); + if( "" == $text ) { + $text= $m[1]; + } # this might be changed for ugliness reasons + } else { + $link = $noslash; # no subpage allowed, use standard link + } + } elseif( $noforce ) { # no subpage + $link = $m[1]; } else { - $link = $noslash; # no subpage allowed, use standard link + $link = substr( $m[1], 1 ); } - } elseif( $noforce ) { # no subpage - $link = $m[1]; - } else { - $link = substr( $m[1], 1 ); - } - $wasblank = ( "" == $text ); - if( $wasblank ) + $wasblank = ( "" == $text ); + if( $wasblank ) $text = $link; - $nt = Title::newFromText( $link ); - if( !$nt ) { - $s .= $prefix . "[[" . $line; - wfProfileOut( $fname ); - return $s; - } - $ns = $nt->getNamespace(); - $iw = $nt->getInterWiki(); - if( $noforce ) { - if( $iw && $this->mOptions->getInterwikiMagic() && $nottalk && $wgLang->getLanguageName( $iw ) ) { - array_push( $this->mOutput->mLanguageLinks, $nt->getPrefixedText() ); - $s .= $prefix . $trail ; + $nt = Title::newFromText( $link ); + if( !$nt ) { + $s .= $prefix . "[[" . $line; wfProfileOut( $fname ); - return (trim($s) == '')? '': $s; + continue; } - if( $ns == $image ) { - $s .= $prefix . $sk->makeImageLinkObj( $nt, $text ) . $trail; - $wgLinkCache->addImageLinkObj( $nt ); - wfProfileOut( $fname ); - return $s; + $ns = $nt->getNamespace(); + $iw = $nt->getInterWiki(); + if( $noforce ) { + if( $iw && $this->mOptions->getInterwikiMagic() && $nottalk && $wgLang->getLanguageName( $iw ) ) { + array_push( $this->mOutput->mLanguageLinks, $nt->getPrefixedText() ); + $s .= $prefix . $trail ; + wfProfileOut( $fname ); + return (trim($s) == '')? '': $s; + continue; + } + if ( $ns == $image ) { + $s .= $prefix . $sk->makeImageLinkObj( $nt, $text ) . $trail; + $wgLinkCache->addImageLinkObj( $nt ); + wfProfileOut( $fname ); + continue; + } else if ( $ns == $category ) { + $t = $nt->getText() ; + $nnt = Title::newFromText ( Namespace::getCanonicalName($category).":".$t ) ; + + $wgLinkCache->suspend(); # Don't save in links/brokenlinks + $t = $sk->makeLinkObj( $nnt, $t, "", "" , $prefix ); + $wgLinkCache->resume(); + + $sortkey = $wasblank ? $this->mTitle->getPrefixedText() : $text; + $wgLinkCache->addCategoryLinkObj( $nt, $sortkey ); + $this->mOutput->mCategoryLinks[] = $t ; + $s .= $prefix . $trail ; + wfProfileOut( $fname ); + continue; + } } - if ( $ns == $category ) { - $t = $nt->getText() ; - $nnt = Title::newFromText ( Namespace::getCanonicalName($category).":".$t ) ; - - $wgLinkCache->suspend(); # Don't save in links/brokenlinks - $t = $sk->makeLinkObj( $nnt, $t, "", "" , $prefix ); - $wgLinkCache->resume(); - - $sortkey = $wasblank ? $this->mTitle->getPrefixedText() : $text; - $wgLinkCache->addCategoryLinkObj( $nt, $sortkey ); - $this->mOutput->mCategoryLinks[] = $t ; - $s .= $prefix . $trail ; + if( ( $nt->getPrefixedText() == $this->mTitle->getPrefixedText() ) && + ( strpos( $link, "#" ) == FALSE ) ) { + # Self-links are handled specially; generally de-link and change to bold. + $s .= $prefix . $sk->makeSelfLinkObj( $nt, $text, "", $trail ); wfProfileOut( $fname ); - return $s ; + continue; } - } - if( ( $nt->getPrefixedText() == $this->mTitle->getPrefixedText() ) && - ( strpos( $link, "#" ) == FALSE ) ) { - # Self-links are handled specially; generally de-link and change to bold. - $s .= $prefix . $sk->makeSelfLinkObj( $nt, $text, "", $trail ); - wfProfileOut( $fname ); - return $s; - } - if( $ns == $media ) { - $s .= $prefix . $sk->makeMediaLinkObj( $nt, $text ) . $trail; - $wgLinkCache->addImageLinkObj( $nt ); - wfProfileOut( $fname ); - return $s; - } elseif( $ns == $special ) { - $s .= $prefix . $sk->makeKnownLinkObj( $nt, $text, "", $trail ); - wfProfileOut( $fname ); - return $s; + if( $ns == $media ) { + $s .= $prefix . $sk->makeMediaLinkObj( $nt, $text ) . $trail; + $wgLinkCache->addImageLinkObj( $nt ); + wfProfileOut( $fname ); + continue; + } elseif( $ns == $special ) { + $s .= $prefix . $sk->makeKnownLinkObj( $nt, $text, "", $trail ); + wfProfileOut( $fname ); + continue; + } + $s .= $sk->makeLinkObj( $nt, $text, "", $trail , $prefix ); } - $s .= $sk->makeLinkObj( $nt, $text, "", $trail , $prefix ); - wfProfileOut( $fname ); return $s; } @@ -1951,25 +1754,16 @@ class Parser return $full; } - /* private */ function doMagicISBN( &$tokenizer ) + /* private */ function magicISBN( $text ) { global $wgLang; - # Check whether next token is a text token - # If yes, fetch it and convert the text into a - # Special::BookSources link - $token = $tokenizer->previewToken(); - while ( $token["type"] == "" ) - { - $tokenizer->nextToken(); - $token = $tokenizer->previewToken(); - } - if ( $token["type"] == "text" ) - { - $token = $tokenizer->nextToken(); - $x = $token["text"]; - $valid = "0123456789-ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + $a = split( "ISBN ", " $text" ); + if ( count ( $a ) < 2 ) return $text; + $text = substr( array_shift( $a ), 1); + $valid = "0123456789-ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + foreach ( $a as $x ) { $isbn = $blank = "" ; while ( " " == $x{0} ) { $blank .= " "; @@ -1983,38 +1777,27 @@ class Parser $num = str_replace( " ", "", $num ); if ( "" == $num ) { - $text = "ISBN $blank$x"; + $text .= "ISBN $blank$x"; } else { $titleObj = Title::makeTitle( NS_SPECIAL, "Booksources" ); - $text = "escapeLocalUrl( "isbn={$num}" ) . "\" class=\"internal\">ISBN $isbn"; $text .= $x; } - } else { - $text = "ISBN "; } return $text; } - /* private */ function doMagicRFC( &$tokenizer ) + /* private */ function magicRFC( $text ) { global $wgLang; - # Check whether next token is a text token - # If yes, fetch it and convert the text into a - # link to an RFC source - $token = $tokenizer->previewToken(); - while ( $token["type"] == "" ) - { - $tokenizer->nextToken(); - $token = $tokenizer->previewToken(); - } - if ( $token["type"] == "text" ) - { - $token = $tokenizer->nextToken(); - $x = $token["text"]; - $valid = "0123456789"; + $a = split( "ISBN ", " $text" ); + if ( count ( $a ) < 2 ) return $text; + $text = substr( array_shift( $a ), 1); + $valid = "0123456789"; + foreach ( $a as $x ) { $rfc = $blank = "" ; while ( " " == $x{0} ) { $blank .= " "; @@ -2032,10 +1815,8 @@ class Parser $url = str_replace( "$1", $rfc, $url); $sk =& $this->mOptions->getSkin(); $la = $sk->getExternalLinkAttributes( $url, "RFC {$rfc}" ); - $text = "RFC {$rfc}{$x}"; + $text .= "RFC {$rfc}{$x}"; } - } else { - $text = "RFC "; } return $text; } -- 2.20.1