From: Jens Frank <jeluf@users.mediawiki.org>
Date: Sat, 24 Apr 2004 10:34:22 +0000 (+0000)
Subject: Handle French typographical rules for spaces before and after punctuation
X-Git-Tag: 1.3.0beta1~287
X-Git-Url: http://git.cyclocoop.org/url?a=commitdiff_plain;h=b6063739058a255611756277d0814dbd6dae2dc7;p=lhc%2Fweb%2Fwiklou.git

Handle French typographical rules for spaces before and after punctuation
---

diff --git a/includes/Parser.php b/includes/Parser.php
index 6a39244396..3a1d83086c 100644
--- a/includes/Parser.php
+++ b/includes/Parser.php
@@ -678,6 +678,13 @@ class Parser
 					# simple text with no further markup
 					$txt = $token["text"];
 					break;
+				case "blank":
+					# Text that contains blanks that have to be converted to
+					# non-breakable spaces for French.
+					# U+202F NARROW NO-BREAK SPACE might be a better choice, but
+					# browser support for Unicode spacing is poor.
+					$txt = str_replace( " ", "<u>&nbsp;</u>", $token["text"] );
+					break;
 				case "[[[":
 					# remember the tag opened with 3 [
 					$threeopen = true;
diff --git a/includes/Tokenizer.php b/includes/Tokenizer.php
index 05754226de..f23df7fb01 100644
--- a/includes/Tokenizer.php
+++ b/includes/Tokenizer.php
@@ -160,6 +160,64 @@ class Tokenizer {
 							}
 							break 2;
 						}
+						break;
+					case "!": // French spacing rules have a space before exclamation
+					case "?": // and question marks. Those have to become &nbsp;
+					case ":": // And colons, Hashar says ...
+						if ( $this->preceeded( " " ) )
+						{
+							// strip blank from Token
+							$token["text"] = substr( $token["text"], 0, -1 );
+							$queueToken["type"] = "blank";
+							$queueToken["text"] = " {$ch}";
+							$this->mQueuedToken[] = $queueToken;
+							$this->mPos ++;
+							break 2; // switch + while
+						}
+						break;
+					case "0": // A space between two numbers is used to ease reading
+					case "1": // of big numbers, e.g. 1 000 000. Those spaces need
+					case "2": // to be unbreakable
+					case "3":
+					case "4":
+					case "5":
+					case "6":
+					case "7":
+					case "8":
+					case "9":
+						if (    ($this->mTextLength >= $this->mPos +2)
+						     && ($this->mText[$this->mPos+1] == " ")
+						     && ctype_digit( $this->mText[$this->mPos+2] ) )
+						{
+							$queueToken["type"] = "blank";
+							$queueToken["text"] = $ch . " ";
+							$this->mQueuedToken[] = $queueToken;
+							$this->mPos += 2;
+							break 2; // switch + while
+						}
+						break;
+					case "\302": // first byte of UTF-8 Character Guillemet-left
+						if ( $this->continues( "\253 ") ) // second byte and a blank
+						{
+							$queueToken["type"] = "blank";
+							$queueToken["text"] = "\302\253 ";
+							$this->mQueuedToken[] = $queueToken;
+							$this->mPos += 3;
+							break 2; // switch + while
+						}
+						break;
+					case "\273": //last byte of UTF-8 Character Guillemet-right
+						if ( $this->preceeded( " \302" ) )
+						{
+							$queueToken["type"] = "blank";
+							$queueToken["text"] = " \302\273";
+							$token["text"] = substr( $token["text"], 0, -2 );
+							$this->mQueuedToken[] = $queueToken;
+							$this->mPos ++;
+							break 2; // switch + while
+						}
+						break;
+
 				} /* switch */
 				$token["text"].=$ch;
 				$this->mPos ++;
@@ -185,6 +243,16 @@ class Tokenizer {
 		}
 		return true;
 	}
-		
+
+	// function preceeded (spelling kept as-is: this is the name the tokenizer calls)
+	// Returns true if the strlen($prec) bytes of mText ending just before mPos equal $prec
+	/* private */ function preceeded( $prec )
+	{
+		$len = strlen( $prec );
+		// fewer than $len bytes lie before mPos, so $prec cannot precede it
+		if ( $this->mPos < $len )
+			return false;
+		return ( 0 == strcmp( $prec, substr($this->mText, $this->mPos-$len, $len) ) );
+	}
 }