new function firstChar: Return first character (not byte) of a string

author Jens Frank <jeluf@users.mediawiki.org>

Sat, 5 Jun 2004 08:31:41 +0000 (08:31 +0000)

committer Jens Frank <jeluf@users.mediawiki.org>

Sat, 5 Jun 2004 08:31:41 +0000 (08:31 +0000)
author Jens Frank <jeluf@users.mediawiki.org>
Sat, 5 Jun 2004 08:31:41 +0000 (08:31 +0000)
committer Jens Frank <jeluf@users.mediawiki.org>
Sat, 5 Jun 2004 08:31:41 +0000 (08:31 +0000)
diff --git a/languages/Language.php b/languages/Language.php

index 48594bb..b9d08d0 100644 (file)
--- a/languages/Language.php
+++ b/languages/Language.php
@@ -1835,6 +1835,12 @@ class Language {
                 return $in;
         }
  
+       function firstChar( $s ) {
+               # Get the first character of a string. In ASCII, return
+               # first byte of the string. UTF8 and others have to
+               # overload this. An empty string yields "" (no offset notice).
+               return isset( $s[0] ) ? $s[0] : "";
+       }
  
         function setAltEncoding() {
                 # Some languages may have an alternate char encoding option
@@ -1896,7 +1902,7 @@ class Language {
                         # Fall back to English if local list is incomplete
                         $raw =& Language::getMagicWords();
                 }
-        $rawEntry = $raw[$mw->mId];
+               $rawEntry = $raw[$mw->mId];
                 $mw->mCaseSensitive = $rawEntry[0];
                 $mw->mSynonyms = array_slice( $rawEntry, 1 );
         }
@@ -1907,18 +1913,6 @@ class Language {
                 return "<em>$text</em>";
         }
  
-       # returns additional Regex for the tokenizer. See LanguageFr.php for an example
-       function tokenizerRegex()
-       {
-               return "";
-       }
-
-       # Process the token generated from the tokenizer by the above regex. Return
-       # NULL if the token is unknown, and the text to be added to the output otherwise
-       function processToken( &$token , &$tokenStack)
-       {
-               return NULL;
-       }
         
         # Normally we use the plain ASCII digits. Some languages such as Arabic will
         # want to output numbers using script-appropriate characters: override this
diff --git a/languages/LanguageUtf8.php b/languages/LanguageUtf8.php

index 85efec3..805589d 100644 (file)
--- a/languages/LanguageUtf8.php
+++ b/languages/LanguageUtf8.php
@@ -64,6 +64,13 @@ class LanguageUtf8 extends Language {
  
                 return $this->iconv( $this->fallback8bitEncoding(), "utf-8", $s );
         }
+
+       function firstChar( $s ) {
+               # First UTF-8 character (1-4 bytes) of $s; "" if empty or malformed start.
+               preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
+               '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})/', $s, $matches);
+               return isset( $matches[1] ) ? $matches[1] : "";
+       }
  }
  
  } # ifdef MEDIAWIKI
author	Jens Frank <jeluf@users.mediawiki.org>
	Sat, 5 Jun 2004 08:31:41 +0000 (08:31 +0000)
committer	Jens Frank <jeluf@users.mediawiki.org>
	Sat, 5 Jun 2004 08:31:41 +0000 (08:31 +0000)
languages/Language.php		patch \| blob \| history
languages/LanguageUtf8.php		patch \| blob \| history