From de2d186f8e29d89f2ff8ebb87b8b9610c4e7d5ef Mon Sep 17 00:00:00 2001 From: Jens Frank Date: Sat, 5 Jun 2004 08:31:41 +0000 Subject: [PATCH] new function firstChar: Return first character (not byte) of a string --- languages/Language.php | 20 +++++++------------- languages/LanguageUtf8.php | 7 +++++++ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/languages/Language.php b/languages/Language.php index 48594bb55b..b9d08d0d9f 100644 --- a/languages/Language.php +++ b/languages/Language.php @@ -1835,6 +1835,12 @@ class Language { return $in; } + function firstChar( $s ) { + # Get the first character of a string. In ASCII, return + # first byte of the string. UTF8 and others have to + # override this. + return $s[0]; + } function setAltEncoding() { # Some languages may have an alternate char encoding option @@ -1896,7 +1902,7 @@ class Language { # Fall back to English if local list is incomplete $raw =& Language::getMagicWords(); } - $rawEntry = $raw[$mw->mId]; + $rawEntry = $raw[$mw->mId]; $mw->mCaseSensitive = $rawEntry[0]; $mw->mSynonyms = array_slice( $rawEntry, 1 ); } @@ -1907,18 +1913,6 @@ class Language { return "$text"; } - # returns additional Regex for the tokenizer. See LanguageFr.php for an example - function tokenizerRegex() - { - return ""; - } - - # Process the token generated from the tokenizer by the above regex. Return - # NULL if the token is unknown, and the text to be added to the output otherwise - function processToken( &$token , &$tokenStack) - { - return NULL; - } # Normally we use the plain ASCII digits. 
Some languages such as Arabic will # want to output numbers using script-appropriate characters: override this diff --git a/languages/LanguageUtf8.php b/languages/LanguageUtf8.php index 85efec3798..805589dd2f 100644 --- a/languages/LanguageUtf8.php +++ b/languages/LanguageUtf8.php @@ -64,6 +64,13 @@ class LanguageUtf8 extends Language { return $this->iconv( $this->fallback8bitEncoding(), "utf-8", $s ); } + + function firstChar( $s ) { + preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' . + '[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})', $s, $matches); + + return isset( $matches[1] ) ? $matches[1] : ""; + } } } # ifdef MEDIAWIKI -- 2.20.1