From de2d186f8e29d89f2ff8ebb87b8b9610c4e7d5ef Mon Sep 17 00:00:00 2001
From: Jens Frank <jeluf@users.mediawiki.org>
Date: Sat, 5 Jun 2004 08:31:41 +0000
Subject: [PATCH] new function firstChar: Return first character (not byte) of
 a string

---
 languages/Language.php     | 20 +++++++-------------
 languages/LanguageUtf8.php |  7 +++++++
 2 files changed, 14 insertions(+), 13 deletions(-)
diff --git a/languages/Language.php b/languages/Language.php
index 48594bb55b..b9d08d0d9f 100644
--- a/languages/Language.php
+++ b/languages/Language.php
@@ -1835,6 +1835,12 @@ class Language {
 		return $in;
 	}
 
+	function firstChar( $s ) {
+		# Return the first character of $s, or "" when $s is empty.
+		# Single-byte encodings: the first byte is the first character.
+		# Multi-byte encodings such as UTF-8 have to overload this.
+		return isset( $s[0] ) ? $s[0] : "";
+	}
 
 	function setAltEncoding() {
 		# Some languages may have an alternate char encoding option
@@ -1896,7 +1902,7 @@ class Language {
 			# Fall back to English if local list is incomplete
 			$raw =& Language::getMagicWords();
 		}
-        $rawEntry = $raw[$mw->mId];
+		$rawEntry = $raw[$mw->mId];
 		$mw->mCaseSensitive = $rawEntry[0];
 		$mw->mSynonyms = array_slice( $rawEntry, 1 );
 	}
@@ -1907,18 +1913,6 @@ class Language {
 		return "<em>$text</em>";
 	}
 
-	# returns additional Regex for the tokenizer. See LanguageFr.php for an example
-	function tokenizerRegex()
-	{
-		return "";
-	}
-
-	# Process the token generated from the tokenizer by the above regex. Return
-	# NULL if the token is unknown, and the text to be added to the output otherwise
-	function processToken( &$token , &$tokenStack)
-	{
-		return NULL;
-	}
 	
 	# Normally we use the plain ASCII digits. Some languages such as Arabic will
 	# want to output numbers using script-appropriate characters: override this
diff --git a/languages/LanguageUtf8.php b/languages/LanguageUtf8.php
index 85efec3798..805589dd2f 100644
--- a/languages/LanguageUtf8.php
+++ b/languages/LanguageUtf8.php
@@ -64,6 +64,13 @@ class LanguageUtf8 extends Language {
 
 		return $this->iconv( $this->fallback8bitEncoding(), "utf-8", $s );
 	}
+
+	function firstChar( $s ) {
+		# First UTF-8 character (1-4 bytes) of $s, or "" if none matches.
+		preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
+		'[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})/', $s, $matches );
+		return isset( $matches[1] ) ? $matches[1] : "";
+	}
 }
 
 } # ifdef MEDIAWIKI
-- 
2.20.1