From: Jens Frank Date: Tue, 2 Mar 2004 20:23:56 +0000 (+0000) Subject: Added hook to tokenizer and to parser for language specific X-Git-Tag: 1.3.0beta1~880 X-Git-Url: http://git.cyclocoop.org/%24href?a=commitdiff_plain;h=e5306123c1044776077953a8717dfcff4f8f1e7c;p=lhc%2Fweb%2Fwiklou.git Added hook to tokenizer and to parser for language specific processing. Using this hook, added a conversion of spaces to non-breaking spaces for the French wikipedia. Switched ----- -> <hr>
processing to tokenizer. --- diff --git a/includes/Parser.php b/includes/Parser.php index 2e4e802dac..e92f6c345d 100644 --- a/includes/Parser.php +++ b/includes/Parser.php @@ -360,7 +360,7 @@ class Parser $text = $this->removeHTMLtags( $text ); $text = $this->replaceVariables( $text ); - $text = preg_replace( "/(^|\n)-----*/", "\\1
", $text ); + # $text = preg_replace( "/(^|\n)-----*/", "\\1
", $text ); $text = str_replace ( "
", "
", $text ); $text = $this->doHeadings( $text ); @@ -542,6 +542,8 @@ class Parser /* private */ function replaceInternalLinks( $str ) { + global $wgLang; # for language specific parser hook + $tokenizer=Tokenizer::newFromString( $str ); $tokenStack = array(); @@ -596,6 +598,9 @@ class Parser } $tagIsOpen = (count( $tokenStack ) != 0); break; + case "----": + $txt = "\n
\n"; + break; case "'''": # This and the three next ones handle quotes $txt = $this->handle3Quotes( $state, $token ); @@ -611,9 +616,13 @@ class Parser $txt=""; break; default: - # An unkown token. Highlight. - $txt = "".$token["type"].""; - $txt .= "".$token["text"].""; + # Call language specific Hook. + $txt = $wgLang->processToken( $token, $tokenStack ); + if ( NULL == $txt ) { + # An unkown token. Highlight. + $txt = "".$token["type"].""; + $txt .= "".$token["text"].""; + } break; } # If we're parsing the interior of a link, don't append the interior to $s, diff --git a/includes/Tokenizer.php b/includes/Tokenizer.php index d7eb080b73..beeda47466 100644 --- a/includes/Tokenizer.php +++ b/includes/Tokenizer.php @@ -26,22 +26,27 @@ class Tokenizer { function preParse() { global $wgLang; + + # build up the regex, step by step. + # Basic features: Quotes for / and hyphens for
+ $regex = "\'\'\'\'\'|\'\'\'|\'\'|\n-----*"; + # Append regex for linkPrefixExtension if ( $wgLang->linkPrefixExtension() ) { - $regex = "/(([a-zA-Z\x80-\xff]+)\[\[|\]\]|\'\'\'\'\'|\'\'\'|\'\')/"; - # 000000000000000000000000000000000000000000000000000000 - # 1111111111111111111111111111111111111111111111111111 - # 222222222222222222 - # which $this->mMatch[...] will contain the match. + $regex .= "|([a-zA-Z\x80-\xff]+)\[\["; } else { - $regex = "/(\[\[|\]\]|\'\'\'\'\'|\'\'\'|\'\')/"; + $regex .= "|\[\["; } + # Closing link + $regex .= "|\]\]"; + # Language-specific additions + $regex .= $wgLang->tokenizerRegex(); + # Finalize regex + $regex = "/(" . $regex . ")/"; + # Apply the regex to the text $this->mCount = preg_match_all( $regex, $this->mText, $this->mMatch, PREG_PATTERN_ORDER|PREG_OFFSET_CAPTURE); $this->mMatchPos=0; - # print( "
" );
-		# print_r( $this->mMatch );
-		# print( "
" ); } function nextToken() @@ -76,6 +81,12 @@ class Tokenizer { $token["text"] = $this->mMatch[2][$this->mMatchPos][0]; # the prefix } else { $token["type"] = $this->mMatch[0][$this->mMatchPos][0]; + if ( substr($token["type"],1,4) == "----" ) + { + # any number of hyphens bigger than four is a
. + # strip down to four. + $token["type"]="----"; + } } # What the pointers would change to if this would not just be a preview $token["mPos"] = $this->mPos + strlen( $this->mMatch[0][$this->mMatchPos][0] ); diff --git a/languages/Language.php b/languages/Language.php index 100e4cf17e..df7e39646d 100644 --- a/languages/Language.php +++ b/languages/Language.php @@ -1732,6 +1732,20 @@ class Language { { return "$text"; } + + # returns additional Regex for the tokenizer. See LanguageFr.php for an example + function tokenizerRegex() + { + return ""; + } + + # Process the token generated from the tokenizer by the above regex. Return + # NULL if the token is unknown, and the text to be added to the output otherwise + function processToken( &$token , &$tokenStack) + { + return NULL; + } + } @include_once( "Language" . ucfirst( $wgLanguageCode ) . ".php" ); diff --git a/languages/LanguageFr.php b/languages/LanguageFr.php index 2e1a858701..86ee9a6f52 100644 --- a/languages/LanguageFr.php +++ b/languages/LanguageFr.php @@ -1066,6 +1066,32 @@ class LanguageFr extends Language else return $m; } + + # returns additional Regex for the tokenizer. + function tokenizerRegex() + { + return "| [:»!?]|« |[0-9] [0-9]"; + } + + # Process the token generated from the tokenizer by the above regex. Return + # NULL if the token is unknown, and the text to be added to the output otherwise + function processToken( &$token , &$tokenStack) + { + if ( preg_match( "/ ([:»!?])/", $token["type"], $m ) ) + { + $txt = " " . $m[1]; + } elseif ( "« " == $token["type"] ) + { + $txt = "« "; + } elseif ( preg_match( "/([0-9]) ([0-9])/", $token["type"], $m ) ) + { + $txt = $m[1] . " " . $m[2]; + } else + { + $txt = NULL; + } + return $txt; + } } ?>