From: Jens Frank <jeluf@users.mediawiki.org>
Date: Tue, 2 Mar 2004 20:23:56 +0000 (+0000)
Subject: Added hook to tokenizer and to parser for language specific
X-Git-Tag: 1.3.0beta1~880
X-Git-Url: http://git.cyclocoop.org/%24href?a=commitdiff_plain;h=e5306123c1044776077953a8717dfcff4f8f1e7c;p=lhc%2Fweb%2Fwiklou.git

Added hooks to the tokenizer and to the parser for language-specific
processing.

Using these hooks, added a conversion of ordinary spaces to non-breaking
spaces (before : » ! ?, after «, and between two digits) for the French
Wikipedia.

Switched the ----- -> <hr> conversion from a preg_replace in the parser
to a token recognized by the tokenizer.
---

diff --git a/includes/Parser.php b/includes/Parser.php
index 2e4e802dac..e92f6c345d 100644
--- a/includes/Parser.php
+++ b/includes/Parser.php
@@ -360,7 +360,7 @@ class Parser
 		$text = $this->removeHTMLtags( $text );
 		$text = $this->replaceVariables( $text );
 
-		$text = preg_replace( "/(^|\n)-----*/", "\\1<hr>", $text );
+		# $text = preg_replace( "/(^|\n)-----*/", "\\1<hr>", $text );
 		$text = str_replace ( "<HR>", "<hr>", $text );
 
 		$text = $this->doHeadings( $text );
@@ -542,6 +542,8 @@ class Parser
 
 	/* private */ function replaceInternalLinks( $str )
 	{
+		global $wgLang;	# for language specific parser hook
+
 		$tokenizer=Tokenizer::newFromString( $str );
 		$tokenStack = array();
 		
@@ -596,6 +598,9 @@ class Parser
 					}
 					$tagIsOpen = (count( $tokenStack ) != 0);
 					break;
+				case "----":
+					$txt = "\n<hr>\n";
+					break;
 				case "'''":
 					# This and the three next ones handle quotes
 					$txt = $this->handle3Quotes( $state, $token );
@@ -611,9 +616,13 @@ class Parser
 					$txt="";
 					break;
 				default:
-					# An unkown token. Highlight.
-					$txt = "<font color=\"#FF0000\"><b>".$token["type"]."</b></font>";
-					$txt .= "<font color=\"#FFFF00\"><b>".$token["text"]."</b></font>";
+					# Call language specific Hook.
+					$txt = $wgLang->processToken( $token, $tokenStack );
+					if ( NULL == $txt ) {
+						# An unkown token. Highlight.
+						$txt = "<font color=\"#FF0000\"><b>".$token["type"]."</b></font>";
+						$txt .= "<font color=\"#FFFF00\"><b>".$token["text"]."</b></font>";
+					}
 					break;
 			}
 			# If we're parsing the interior of a link, don't append the interior to $s,
diff --git a/includes/Tokenizer.php b/includes/Tokenizer.php
index d7eb080b73..beeda47466 100644
--- a/includes/Tokenizer.php
+++ b/includes/Tokenizer.php
@@ -26,22 +26,27 @@ class Tokenizer {
 	function preParse()
 	{
 		global $wgLang;
+
+		# build up the regex, step by step.
+		# Basic features: Quotes for <em>/<strong> and hyphens for <hr>
+		$regex = "\'\'\'\'\'|\'\'\'|\'\'|\n-----*";
+		# Append regex for linkPrefixExtension 
 		if (  $wgLang->linkPrefixExtension() ) {
-			$regex = "/(([a-zA-Z\x80-\xff]+)\[\[|\]\]|\'\'\'\'\'|\'\'\'|\'\')/";
-			#          000000000000000000000000000000000000000000000000000000
-			#           1111111111111111111111111111111111111111111111111111
-			#            222222222222222222
-			# which $this->mMatch[...] will contain the match.
+			$regex .= "|([a-zA-Z\x80-\xff]+)\[\[";
 		} else {
-			$regex = "/(\[\[|\]\]|\'\'\'\'\'|\'\'\'|\'\')/";
+			$regex .= "|\[\[";
 		}
+		# Closing link
+		$regex .= "|\]\]";
+		# Language-specific additions
+		$regex .= $wgLang->tokenizerRegex();
+		# Finalize regex
+		$regex = "/(" . $regex . ")/";
 
+		# Apply the regex to the text
 		$this->mCount = preg_match_all( $regex, $this->mText, $this->mMatch,
 						PREG_PATTERN_ORDER|PREG_OFFSET_CAPTURE);
 		$this->mMatchPos=0;
-		# print( "<pre>" );
-		# print_r( $this->mMatch );
-		# print( "</pre>" );
 	}
 
 	function nextToken()
@@ -76,6 +81,12 @@ class Tokenizer {
 					$token["text"] = $this->mMatch[2][$this->mMatchPos][0]; # the prefix
 				} else {
 					$token["type"] = $this->mMatch[0][$this->mMatchPos][0];
+					if ( substr($token["type"],1,4) == "----" )
+					{
+						# any number of hyphens bigger than four is a <HR>. 
+						# strip down to four.
+						$token["type"]="----";
+					}
 				}
 				# What the pointers would change to if this would not just be a preview
 				$token["mPos"] = $this->mPos + strlen( $this->mMatch[0][$this->mMatchPos][0] );
diff --git a/languages/Language.php b/languages/Language.php
index 100e4cf17e..df7e39646d 100644
--- a/languages/Language.php
+++ b/languages/Language.php
@@ -1732,6 +1732,20 @@ class Language {
 	{
 		return "<em>$text</em>";
 	}
+
+	# returns additional Regex for the tokenizer. See LanguageFr.php for an example
+	function tokenizerRegex()
+	{
+		return "";
+	}
+
+	# Process the token generated from the tokenizer by the above regex. Return
+	# NULL if the token is unknown, and the text to be added to the output otherwise
+	function processToken( &$token , &$tokenStack)
+	{
+		return NULL;
+	}
+
 }
 
 @include_once( "Language" . ucfirst( $wgLanguageCode ) . ".php" );
diff --git a/languages/LanguageFr.php b/languages/LanguageFr.php
index 2e1a858701..86ee9a6f52 100644
--- a/languages/LanguageFr.php
+++ b/languages/LanguageFr.php
@@ -1066,6 +1066,32 @@ class LanguageFr extends Language
 		else return $m;
 
 	}
+
+	# returns additional Regex for the tokenizer.
+	function tokenizerRegex()
+	{
+		return "| [:»!?]|« |[0-9] [0-9]";
+	}
+
+	# Process the token generated from the tokenizer by the above regex. Return
+	# NULL if the token is unknown, and the text to be added to the output otherwise
+	function processToken( &$token , &$tokenStack)
+	{
+		if ( preg_match( "/ ([:»!?])/", $token["type"], $m ) )
+		{
+			$txt = "&nbsp;" . $m[1];
+		} elseif ( "« " == $token["type"] )
+		{
+			$txt = "«&nbsp;";
+		} elseif ( preg_match( "/([0-9]) ([0-9])/", $token["type"], $m ) )
+		{
+			$txt = $m[1] . "&nbsp;" . $m[2];
+		} else
+		{
+			$txt = NULL;
+		}
+		return $txt;
+	}
 }
 
 ?>