From: Jens Frank Date: Sat, 28 Feb 2004 23:38:08 +0000 (+0000) Subject: Added real parser/tokenizer. X-Git-Tag: 1.3.0beta1~912 X-Git-Url: http://git.cyclocoop.org/%28?a=commitdiff_plain;h=7c2a2d58464077ac862afd700c8d121dff263e37;p=lhc%2Fweb%2Fwiklou.git Added real parser/tokenizer. Tokenizer is a new class that splits a text into tokens. Parser calls the tokenizer to get one token after another and handles them one by one. Parser::doAllQuotes and Parser::replaceInternalLinks have been replaced by the new parser. Image thumbnailing now allows links in the captions. --- diff --git a/includes/Parser.php b/includes/Parser.php index 3b883c082d..d9b59572ac 100644 --- a/includes/Parser.php +++ b/includes/Parser.php @@ -1,5 +1,7 @@ ", $text ); $text = str_replace ( "
", "
", $text ); - $text = $this->doAllQuotes( $text ); $text = $this->doHeadings( $text ); $text = $this->doBlockLevels( $text, $linestart ); @@ -377,63 +378,6 @@ function doTableStuff ( $t ) return $text; } - /* private */ function doAllQuotes( $text ) - { - $outtext = ""; - $lines = explode( "\r\n", $text ); - foreach ( $lines as $line ) { - $outtext .= $this->doQuotes ( "", $line, "" ) . "\r\n"; - } - return $outtext; - } - - /* private */ function doQuotes( $pre, $text, $mode ) - { - if ( preg_match( "/^(.*)''(.*)$/sU", $text, $m ) ) { - $m1_strong = ($m[1] == "") ? "" : "{$m[1]}"; - $m1_em = ($m[1] == "") ? "" : "{$m[1]}"; - if ( substr ($m[2], 0, 1) == "'" ) { - $m[2] = substr ($m[2], 1); - if ($mode == "em") { - return $this->doQuotes ( $m[1], $m[2], ($m[1] == "") ? "both" : "emstrong" ); - } else if ($mode == "strong") { - return $m1_strong . $this->doQuotes ( "", $m[2], "" ); - } else if (($mode == "emstrong") || ($mode == "both")) { - return $this->doQuotes ( "", $pre.$m1_strong.$m[2], "em" ); - } else if ($mode == "strongem") { - return "{$pre}{$m1_em}" . $this->doQuotes ( "", $m[2], "em" ); - } else { - return $m[1] . $this->doQuotes ( "", $m[2], "strong" ); - } - } else { - if ($mode == "strong") { - return $this->doQuotes ( $m[1], $m[2], ($m[1] == "") ? "both" : "strongem" ); - } else if ($mode == "em") { - return $m1_em . $this->doQuotes ( "", $m[2], "" ); - } else if ($mode == "emstrong") { - return "{$pre}{$m1_strong}" . $this->doQuotes ( "", $m[2], "strong" ); - } else if (($mode == "strongem") || ($mode == "both")) { - return $this->doQuotes ( "", $pre.$m1_em.$m[2], "strong" ); - } else { - return $m[1] . $this->doQuotes ( "", $m[2], "em" ); - } - } - } else { - $text_strong = ($text == "") ? "" : "{$text}"; - $text_em = ($text == "") ? "" : "{$text}"; - if ($mode == "") { - return $pre . $text; - } else if ($mode == "em") { - return $pre . $text_em; - } else if ($mode == "strong") { - return $pre . 
$text_strong; - } else if ($mode == "strongem") { - return (($pre == "") && ($text == "")) ? "" : "{$pre}{$text_em}"; - } else { - return (($pre == "") && ($text == "")) ? "" : "{$pre}{$text_strong}"; - } - } - } /* private */ function doHeadings( $text ) { @@ -531,139 +475,301 @@ function doTableStuff ( $t ) return $s; } - /* private */ function replaceInternalLinks( $s ) + /* private */ function handle3Quotes( &$state, $token ) + { + if ( $state["strong"] ) { + if ( $state["em"] && $state["em"] > $state["strong"] ) + { + # ''' lala ''lala ''' + $s = ""; + } else { + $s = ""; + } + $state["strong"] = FALSE; + } else { + $s = ""; + $state["strong"] = $token["pos"]; + } + return $s; + } + + /* private */ function handle2Quotes( &$state, $token ) + { + if ( $state["em"] ) { + if ( $state["strong"] && $state["strong"] > $state["em"] ) + { + # ''lala'''lala'' ....''' + $s = ""; + } else { + $s = ""; + } + $state["em"] = FALSE; + } else { + $s = ""; + $state["em"] = $token["pos"]; + } + return $s; + } + + /* private */ function handle5Quotes( &$state, $token ) + { + if ( $state["em"] && $state["strong"] ) { + if ( $state["em"] < $state["strong"] ) { + $s .= ""; + } else { + $s .= ""; + } + $state["strong"] = $state["em"] = FALSE; + } elseif ( $state["em"] ) { + $s .= ""; + $state["em"] = FALSE; + $state["strong"] = $token["pos"]; + } elseif ( $state["strong"] ) { + $s .= ""; + $state["strong"] = FALSE; + $state["em"] = $token["pos"]; + } else { # not $em and not $strong + $s .= ""; + $state["strong"] = $state["em"] = $token["pos"]; + } + return $s; + } + + /* private */ function replaceInternalLinks( $str ) + { + $tokenizer=Tokenizer::newFromString( $str ); + $tokenStack = array(); + + $s=""; + $state["em"] = FALSE; + $state["strong"] = FALSE; + $tagIsOpen = FALSE; + + # The tokenizer splits the text into tokens and returns them one by one. + # Every call to the tokenizer returns a new token. 
+ while ( $token = $tokenizer->nextToken() ) + { + switch ( $token["type"] ) + { + case "text": + # simple text with no further markup + $txt = $token["text"]; + break; + case "[[": + # link opening tag. + # FIXME : Treat orphaned open tags (stack not empty when text is over) + $tagIsOpen = TRUE; + array_push( $tokenStack, $token ); + $txt=""; + break; + case "]]": + # link close tag. + # get text from stack, glue it together, and call the code to handle a + # link + if ( count( $tokenStack ) == 0 ) + { + # stack empty. Found a ]] without an opening [[ + $txt = "]]"; + } else { + $linkText = ""; + $lastToken = array_pop( $tokenStack ); + while ( $lastToken["type"] != "[[" ) + { + $linkText = $lastToken["text"] . $linkText; + $lastToken = array_pop( $tokenStack ); + } + $txt = $linkText ."]]"; + $nextToken = $tokenizer->previewToken(); + if ( $nextToken["type"] == "text" ) + { + # Preview just looks at it. Now we have to fetch it. + $nextToken = $tokenizer->nextToken(); + $txt .= $nextToken["text"]; + } + $txt = $this->handleInternalLink( $txt ); + #$txt = "<" . $txt . ">"; + } + $tagIsOpen = (count( $tokenStack ) != 0); + break; + case "'''": + # This and the three next ones handle quotes + $txt = $this->handle3Quotes( $state, $token ); + break; + case "''": + $txt = $this->handle2Quotes( $state, $token ); + break; + case "'''''": + $txt = $this->handle5Quotes( $state, $token ); + break; + case "": + # empty token + $txt=""; + break; + default: + # An unkown token. Highlight. + $txt = "".$token["type"].""; + $txt .= "".$token["text"].""; + break; + } + # If we're parsing the interior of a link, don't append the interior to $s, + # but push it to the stack so it can be processed when a ]] token is found. + if ( $tagIsOpen && $txt != "" ) { + $token["type"] = "text"; + $token["text"] = $txt; + array_push( $tokenStack, $token ); + } else { + $s .= $txt; + } + } #end while + if ( count( $tokenStack ) != 0 ) + { + # still objects on stack. 
opened [[ tag without closing ]] tag. + $txt = ""; + while ( $lastToken = array_pop( $tokenStack ) ) + { + if ( $lastToken["type"] == "text" ) + { + $txt = $lastToken["text"] . $txt; + } else { + $txt = $lastToken["type"] . $txt; + } + } + $s .= $txt; + } + return $s; + } + + /* private */ function handleInternalLink( $line ) { global $wgTitle, $wgUser, $wgLang; global $wgLinkCache, $wgInterwikiMagic, $wgUseCategoryMagic; global $wgNamespacesWithSubpages, $wgLanguageCode; - wfProfileIn( $fname = "OutputPage::replaceInternalLinks" ); + static $fname = "OutputPage::replaceInternalLinks" ; + wfProfileIn( $fname ); wfProfileIn( "$fname-setup" ); - $tc = Title::legalChars() . "#"; - $sk = $wgUser->getSkin(); - - $a = explode( "[[", " " . $s ); - $s = array_shift( $a ); - $s = substr( $s, 1 ); + static $tc = FALSE; + static $sk = FALSE; + if ( !$tc ) { $tc = Title::legalChars() . "#"; } + if ( !$sk ) { $sk = $wgUser->getSkin(); } # Match a link having the form [[namespace:link|alternate]]trail - $e1 = "/^([{$tc}]+)(?:\\|([^]]+))?]](.*)\$/sD"; + static $e1 = FALSE; + if ( !$e1 ) { $e1 = "/^([{$tc}]+)(?:\\|([^]]+))?]](.*)\$/sD"; } # Match the end of a line for a word that's not followed by whitespace, # e.g. 
in the case of 'The Arab al[[Razi]]', 'al' will be matched #$e2 = "/^(.*)\\b(\\w+)\$/suD"; #$e2 = "/^(.*\\s)(\\S+)\$/suD"; - $e2 = '/^(.*\s)([a-zA-Z\x80-\xff]+)$/sD'; + static $e2 = '/^(.*\s)([a-zA-Z\x80-\xff]+)$/sD'; # Special and Media are pseudo-namespaces; no pages actually exist in them - $image = Namespace::getImage(); - $special = Namespace::getSpecial(); - $media = Namespace::getMedia(); - $category = wfMsg ( "category" ) ; - $nottalk = !Namespace::isTalk( $wgTitle->getNamespace() ); + static $image = FALSE; + static $special = FALSE; + static $media = FALSE; + static $category = FALSE; + static $nottalk = ""; + if ( !$image ) { $image = Namespace::getImage(); } + if ( !$special ) { $special = Namespace::getSpecial(); } + if ( !$media ) { $media = Namespace::getMedia(); } + if ( !$category ) { $category = wfMsg ( "category" ) ; } + if ( $nottalk=="" ) { $nottalk = !Namespace::isTalk( $wgTitle->getNamespace() ); } - if ( $wgLang->linkPrefixExtension() && preg_match( $e2, $s, $m ) ) { - $new_prefix = $m[2]; - $s = $m[1]; - } else { - $new_prefix=""; - } wfProfileOut( "$fname-setup" ); - foreach ( $a as $line ) { - $prefix = $new_prefix; - if ( $wgLang->linkPrefixExtension() && preg_match( $e2, $line, $m ) ) { - $new_prefix = $m[2]; - $line = $m[1]; + $prefix = $new_prefix; + if ( $wgLang->linkPrefixExtension() && preg_match( $e2, $line, $m ) ) { + $new_prefix = $m[2]; + $line = $m[1]; + } else { + $new_prefix = ""; + } + if ( preg_match( $e1, $line, $m ) ) { # page with normal text or alt + $text = $m[2]; + $trail = $m[3]; + } else { # Invalid form; output directly + $s .= $prefix . "[[" . 
$line ; + return $s; + } + + /* Valid link forms: + Foobar -- normal + :Foobar -- override special treatment of prefix (images, language links) + /Foobar -- convert to CurrentPage/Foobar + /Foobar/ -- convert to CurrentPage/Foobar, strip the initial / from text + */ + $c = substr($m[1],0,1); + $noforce = ($c != ":"); + if( $c == "/" ) { # subpage + if(substr($m[1],-1,1)=="/") { # / at end means we don't want the slash to be shown + $m[1]=substr($m[1],1,strlen($m[1])-2); + $noslash=$m[1]; } else { - $new_prefix = ""; + $noslash=substr($m[1],1); } - if ( preg_match( $e1, $line, $m ) ) { # page with normal text or alt - $text = $m[2]; - $trail = $m[3]; - } else { # Invalid form; output directly - $s .= $prefix . "[[" . $line ; - continue; - } - - /* Valid link forms: - Foobar -- normal - :Foobar -- override special treatment of prefix (images, language links) - /Foobar -- convert to CurrentPage/Foobar - /Foobar/ -- convert to CurrentPage/Foobar, strip the initial / from text - */ - $c = substr($m[1],0,1); - $noforce = ($c != ":"); - if( $c == "/" ) { # subpage - if(substr($m[1],-1,1)=="/") { # / at end means we don't want the slash to be shown - $m[1]=substr($m[1],1,strlen($m[1])-2); - $noslash=$m[1]; - } else { - $noslash=substr($m[1],1); - } - if($wgNamespacesWithSubpages[$wgTitle->getNamespace()]) { # subpages allowed here - $link = $wgTitle->getPrefixedText(). "/" . trim($noslash); - if( "" == $text ) { - $text= $m[1]; - } # this might be changed for ugliness reasons - } else { - $link = $noslash; # no subpage allowed, use standard link - } - } elseif( $noforce ) { # no subpage - $link = $m[1]; + if($wgNamespacesWithSubpages[$wgTitle->getNamespace()]) { # subpages allowed here + $link = $wgTitle->getPrefixedText(). "/" . 
trim($noslash); + if( "" == $text ) { + $text= $m[1]; + } # this might be changed for ugliness reasons } else { - $link = substr( $m[1], 1 ); + $link = $noslash; # no subpage allowed, use standard link } - if( "" == $text ) - $text = $link; + } elseif( $noforce ) { # no subpage + $link = $m[1]; + } else { + $link = substr( $m[1], 1 ); + } + if( "" == $text ) + $text = $link; - $nt = Title::newFromText( $link ); - if( !$nt ) { - $s .= $prefix . "[[" . $line; - continue; - } - $ns = $nt->getNamespace(); - $iw = $nt->getInterWiki(); - if( $noforce ) { - if( $iw && $wgInterwikiMagic && $nottalk && $wgLang->getLanguageName( $iw ) ) { - array_push( $this->mOutput->mLanguageLinks, $nt->getPrefixedText() ); - $s .= $prefix . $trail; - continue; - } - if( $ns == $image ) { - $s .= $prefix . $sk->makeImageLinkObj( $nt, $text ) . $trail; - $wgLinkCache->addImageLinkObj( $nt ); - continue; - } - } - if( ( $nt->getPrefixedText() == $wgTitle->getPrefixedText() ) && - ( strpos( $link, "#" ) == FALSE ) ) { - $s .= $prefix . "" . $text . "" . $trail; - continue; - } - if ( $ns == $category && $wgUseCategoryMagic ) { - $t = explode ( ":" , $nt->getText() ) ; - array_shift ( $t ) ; - $t = implode ( ":" , $t ) ; - $t = $wgLang->ucFirst ( $t ) ; -# $t = $sk->makeKnownLink( $category.":".$t, $t, "", $trail , $prefix ); - $nnt = Title::newFromText ( $category.":".$t ) ; - $t = $sk->makeLinkObj( $nnt, $t, "", $trail , $prefix ); - $this->mCategoryLinks[] = $t ; - $s .= $prefix . $trail ; - continue ; + $nt = Title::newFromText( $link ); + if( !$nt ) { + $s .= $prefix . "[[" . $line; + return $s; + } + $ns = $nt->getNamespace(); + $iw = $nt->getInterWiki(); + if( $noforce ) { + if( $iw && $wgInterwikiMagic && $nottalk && $wgLang->getLanguageName( $iw ) ) { + array_push( $this->mOutput->mLanguageLinks, $nt->getPrefixedText() ); + $s .= $prefix . $trail; + return $s; } - if( $ns == $media ) { - $s .= $prefix . $sk->makeMediaLinkObj( $nt, $text ) . 
$trail; + if( $ns == $image ) { + $s .= $prefix . $sk->makeImageLinkObj( $nt, $text ) . $trail; $wgLinkCache->addImageLinkObj( $nt ); - continue; - } elseif( $ns == $special ) { - $s .= $prefix . $sk->makeKnownLinkObj( $nt, $text, "", $trail ); - continue; + return $s; } - $s .= $sk->makeLinkObj( $nt, $text, "", $trail , $prefix ); } + if( ( $nt->getPrefixedText() == $wgTitle->getPrefixedText() ) && + ( strpos( $link, "#" ) == FALSE ) ) { + $s .= $prefix . "" . $text . "" . $trail; + return $s; + } + if ( $ns == $category && $wgUseCategoryMagic ) { + $t = explode ( ":" , $nt->getText() ) ; + array_shift ( $t ) ; + $t = implode ( ":" , $t ) ; + $t = $wgLang->ucFirst ( $t ) ; +# $t = $sk->makeKnownLink( $category.":".$t, $t, "", $trail , $prefix ); + $nnt = Title::newFromText ( $category.":".$t ) ; + $t = $sk->makeLinkObj( $nnt, $t, "", $trail , $prefix ); + $this->mCategoryLinks[] = $t ; + $s .= $prefix . $trail ; + return $s ; + } + if( $ns == $media ) { + $s .= $prefix . $sk->makeMediaLinkObj( $nt, $text ) . $trail; + $wgLinkCache->addImageLinkObj( $nt ); + return $s; + } elseif( $ns == $special ) { + $s .= $prefix . $sk->makeKnownLinkObj( $nt, $text, "", $trail ); + return $s; + } + $s .= $sk->makeLinkObj( $nt, $text, "", $trail , $prefix ); + wfProfileOut( $fname ); return $s; } diff --git a/includes/Skin.php b/includes/Skin.php index d933a110d7..0bd52f44a8 100644 --- a/includes/Skin.php +++ b/includes/Skin.php @@ -1636,7 +1636,9 @@ class Skin { $url = wfImageUrl( $name ); $path = wfImagePath( $name ); - $label = htmlspecialchars( $label ); + #$label = htmlspecialchars( $label ); + $alt = preg_replace( "/<[^>]*>/", "", $label); + $alt = htmlspecialchars( $alt ); list($width, $height, $type, $attr) = getimagesize( $path ); $boxheight = intval( $height/($width/$boxwidth) ); @@ -1649,11 +1651,11 @@ class Skin { $u = wfLocalUrlE( $link ); - $more = wfMsg( "thumbnail-more" ); + $more = htmlspecialchars(wfMsg( "thumbnail-more" )); $s = "
" . - "" . - "\"{$label}\"" . + "" . + "\"{$alt}\"" . "" . "\"{$more}\"" . "

{$label}

"; diff --git a/includes/Tokenizer.php b/includes/Tokenizer.php new file mode 100644 index 0000000000..5412e550f9 --- /dev/null +++ b/includes/Tokenizer.php @@ -0,0 +1,70 @@ +mPos=0; + } + + # factory function + function newFromString( $s ) + { + $t = new Tokenizer(); + $t->mText = $s; + $t->preParse(); + $t->mTextLength = strlen( $s ); + return $t; + } + + function preParse() + { + $this->mCount = preg_match_all( "/(\[\[|\]\]|\'\'\'\'\'|\'\'\'|\'\')/", + $this->mText, $this->mM, + PREG_PATTERN_ORDER|PREG_OFFSET_CAPTURE); + $this->mMPos=0; + } + + function nextToken() + { + $token = $this->previewToken(); + if ( $token ) { + if ( $token["type"] == "text" ) { + $this->mPos = $token["mPos"]; + } else { + $this->mMPos = $token["mMPos"]; + $this->mPos = $token["mPos"]; + } + } + return $token; + } + + + function previewToken() + { + if ( $this->mMPos <= $this->mCount ) { + $token["pos"] = $this->mPos; + if ( $this->mPos < $this->mM[0][$this->mMPos][1] ) { + $token["type"] = "text"; + $token["text"] = substr( $this->mText, $this->mPos, + $this->mM[0][$this->mMPos][1] - $this->mPos ); + $token["mPos"] = $this->mM[0][$this->mMPos][1]; + } else { + $token["type"] = $this->mM[0][$this->mMPos][0]; + $token["mPos"] = $this->mPos + strlen($token["type"]); + $token["mMPos"] = $this->mMPos + 1; + } + } elseif ( $this->mPos < $this->mTextLength ) { + $token["type"] = "text"; + $token["text"] = substr( $this->mText, $this->mPos ); + $token["mPos"] = $this->mTextLength; + } else { + $token = FALSE; + } + return $token; + } + + +} +