typo leading to fatal error.
[lhc/web/wiklou.git] / includes / Tokenizer.php
index eb98237..f23df7f 100644 (file)
@@ -8,17 +8,24 @@ class Tokenizer {
 
        /* private */ function Tokenizer()
        {
+               global $wgLang;
+
                $this->mPos=0;
                $this->mTokenQueue=array();
+               $this->linkPrefixExtension = $wgLang->linkPrefixExtension();
        }
 
        # Factory function: builds a Tokenizer for the string $s.
        # Stores $s in mText and its byte length (strlen) in mTextLength and
        # returns the new instance. The work is bracketed by
        # wfProfileIn()/wfProfileOut() under the name "Tokenizer::newFromString".
        function newFromString( $s )
        {
+               $fname = "Tokenizer::newFromString";
+               wfProfileIn( $fname );
+
                $t = new Tokenizer();
                $t->mText = $s;
                $t->mTextLength = strlen( $s );
-               // echo "New tokenizer generated. <pre>{$s}</pre>\n"; 
+
+               wfProfileOut( $fname );
                return $t;
        }
 
@@ -30,6 +37,9 @@ class Tokenizer {
        // the stored token.
        function previewToken()
        {
+               $fname = "Tokenizer::previewToken";
+               wfProfileIn( $fname );
+
                if ( count( $this->mQueuedToken ) != 0 ) {
                        // still one token from the last round around. Return that one first.
                        $token = $this->mQueuedToken[0];
@@ -37,6 +47,8 @@ class Tokenizer {
                        $token = $this->nextToken();
                        array_unshift( $this->mQueuedToken, $token );
                }
+
+               wfProfileOut( $fname );
                return $token;
        }
 
@@ -45,28 +57,27 @@ class Tokenizer {
        // proceeds character by character through the text, looking for characters needing
        // special attention. Those are currently: I, R, ', [, ], newline, carriage return,
        // !, ?, :, the digits 0-9 and the UTF-8 guillemet bytes \302 and \273
        //
-       // TODO: prefixed links for Arabic wikipedia not implemented yet
-       //       handling of French blanks not yet implemented
+       // TODO:  handling of French blanks not yet implemented
        function nextToken()
        {
+               $fname = "Tokenizer::nextToken";
+               wfProfileIn( $fname );
+
                if ( count( $this->mQueuedToken ) != 0 ) {
                        // still one token from the last round around. Return that one first.
                        $token = array_shift( $this->mQueuedToken );
+               } else if ( $this->mPos > $this->mTextLength ) {
+                       // If no text is left, return "false".
+                       $token = false;
                } else {
 
                        $token["text"]="";
                        $token["type"]="text";
 
-                       // If no text is left, return "false".
-                       if ( $this->mPos > $this->mTextLength )
-                               return false;
-
                        while ( $this->mPos <= $this->mTextLength ) {
                                switch ( @$ch = $this->mText[$this->mPos] ) {
                                        case 'R': // for "RFC "
-                                               if ( $this->mText[$this->mPos+1] == 'F' &&
-                                               $this->mText[$this->mPos+2] == 'C' &&
-                                               $this->mText[$this->mPos+4] == ' ' ) {
+                                               if ( $this->continues("FC ") ) {
                                                        $queueToken["type"] = $queueToken["text"] = "RFC ";
                                                        $this->mQueuedToken[] = $queueToken;
                                                        $this->mPos += 3;
@@ -74,10 +85,7 @@ class Tokenizer {
                                                }
                                                break;
                                        case 'I': // for "ISBN "
-                                               if ( $this->mText[$this->mPos+1] == 'S' &&
-                                               $this->mText[$this->mPos+2] == 'B' &&
-                                               $this->mText[$this->mPos+3] == 'N' &&
-                                               $this->mText[$this->mPos+4] == ' ' ) {
+                                               if ( $this->continues("SBN ") ) {
                                                        $queueToken["type"] = $queueToken["text"] = "ISBN ";
                                                        $this->mQueuedToken[] = $queueToken;
                                                        $this->mPos += 4;
@@ -85,23 +93,35 @@ class Tokenizer {
                                                }
                                                break;
                                        case "[": // for links "[["
-                                               if ( $this->mText[$this->mPos+1] == "[" &&
-                                                    $this->mText[$this->mPos+2] == "[" ) {
+                                               if ( $this->continues("[[") ) {
                                                        $queueToken["type"] = "[[[";
                                                        $queueToken["text"] = "";
                                                        $this->mQueuedToken[] = $queueToken;
                                                        $this->mPos += 3;
                                                        break 2; // switch + while
-                                               } else if ( $this->mText[$this->mPos+1] == "[" ) {
-                                                       $queueToken["type"] = "[[";
+                                               } else if ( $this->continues("[") ) {
+                                                       $queueToken["type"] = "[[";
                                                        $queueToken["text"] = "";
+                                                       // Check for a "prefixed link", e.g. Al[[Khazar]]
+                                                       // Mostly for arabic wikipedia
+                                                       if ( $this->linkPrefixExtension ) {
+                                                               while (    $this->linkPrefixExtension
+                                                                       && ($len = strlen( $token["text"] ) ) > 0 
+                                                                       && !ctype_space( $token["text"][$len-1] ) )
+                                                               {
+                                                                       //prepend the character to the link's open tag
+                                                                       $queueToken["text"] = $token["text"][$len-1] . $queueToken["text"];
+                                                                       //remove character from the end of the text token
+                                                                       $token["text"] = substr( $token["text"], 0, -1);
+                                                               }
+                                                       }
                                                        $this->mQueuedToken[] = $queueToken;
                                                        $this->mPos += 2;
                                                        break 2; // switch + while 
                                                }
                                                break;
                                        case "]": // for end of links "]]"
-                                               if ( $this->mText[$this->mPos+1] == "]" ) {
+                                               if ( $this->continues("]") ) {
                                                        $queueToken["type"] = "]]";
                                                        $queueToken["text"] = "";
                                                        $this->mQueuedToken[] = $queueToken;
@@ -110,10 +130,12 @@ class Tokenizer {
                                                }
                                                break;
                                        case "'": // for all kind of em's and strong's
-                                               if ( $this->mText[$this->mPos+1] == "'" ) {
+                                               if ( $this->continues("'") ) {
                                                        $queueToken["type"] = "'";
                                                        $queueToken["text"] = "";
-                                                       while(isset($this->mText[$this->mPos+1]) && $this->mText[$this->mPos+1] == "'" ) {
+                                                       while(   ($this->mPos+1 < $this->mTextLength) 
+                                                              && $this->mText[$this->mPos+1] == "'" )
+                                                       {
                                                                $queueToken["type"] .= "'";
                                                                $this->mPos ++;
                                                        }
@@ -125,29 +147,112 @@ class Tokenizer {
                                                break;
                                        case "\n": // for block levels, actually, only "----" is handled.
                                        case "\r":
-                                               if ( isset($this->mText[$this->mPos+4]) &&
-                                                    $this->mText[$this->mPos+1] == "-" &&
-                                                    $this->mText[$this->mPos+2] == "-" &&
-                                                    $this->mText[$this->mPos+3] == "-" &&
-                                                    $this->mText[$this->mPos+4] == "-" ) {
+                                               if ( $this->continues( "----" ) )
+                                               {
                                                        $queueToken["type"] = "----";
                                                        $queueToken["text"] = "";
                                                        $this->mQueuedToken[] = $queueToken;
                                                        $this->mPos += 5;
-                                                       while (isset($this->mText[$this->mPos]) and $this->mText[$this->mPos] == "-" ) {
+                                                       while (     $this->mPos<$this->mTextLength 
+                                                               and $this->mText[$this->mPos] == "-" )
+                                                       {
                                                                $this->mPos ++;
                                                        }
                                                        break 2;
                                                }
+                                               break;
+                                       case "!": // French spacing rules have a space before exclamation
+                                       case "?": // and question marks. Those have to become &nbsp;
+                                       case ":": // And colons, Hashar says ...
+                                               if ( $this->preceeded( " " ) )
+                                               {
+                                                       // strip blank from Token
+                                                       $token["text"] = substr( $token["text"], 0, -1 );
+                                                       $queueToken["type"] = "blank";
+                                                       $queueToken["text"] = " {$ch}";
+                                                       $this->mQueuedToken[] = $queueToken;
+                                                       $this->mPos ++;
+                                                       break 2; // switch + while
+                                               }
+                                               break;
+                                       case "0": // A space between two numbers is used to ease reading
+                                       case "1": // of big numbers, e.g. 1 000 000. Those spaces need
+                                       case "2": // to be unbreakable
+                                       case "3":
+                                       case "4":
+                                       case "5":
+                                       case "6":
+                                       case "7":
+                                       case "8":
+                                       case "9":
+                                               if (    ($this->mTextLength >= $this->mPos +2)
+                                                    && ($this->mText[$this->mPos+1] == " ")
+                                                    && ctype_digit( $this->mText[$this->mPos+2] ) )
+                                               {
+                                                       $queueToken["type"] = "blank";
+                                                       $queueToken["text"] = $ch . " ";
+                                                       $this->mQueuedToken[] = $queueToken;
+                                                       $this->mPos += 2;
+                                                       break 2; // switch + while
+                                               }
+                                               break;
+                                       case "\302": // first byte of UTF-8 Character Guillemet-left
+                                               if ( $this->continues( "\253 ") ) // second byte and a blank
+                                               {
+                                                       $queueToken["type"] = "blank";
+                                                       $queueToken["text"] = "\302\253 ";
+                                                       $this->mQueuedToken[] = $queueToken;
+                                                       $this->mPos += 3;
+                                                       break 2; // switch + while
+                                               }
+                                               break;
+                                       case "\273": //last byte of UTF-8 Character Guillemet-right
+                                               if ( $this->preceeded( " \302" ) )
+                                               {
+                                                       $queueToken["type"] = "blank";
+                                                       $queueToken["text"] = " \302\273";
+                                                       $token["text"] = substr( $token["text"], 0, -2 );
+                                                       $this->mQueuedToken[] = $queueToken;
+                                                       $this->mPos ++;
+                                                       break 2; // switch + while
+                                               }
+                                               break;
+
                                } /* switch */
                                $token["text"].=$ch;
                                $this->mPos ++;
                                // echo $this->mPos . "<br>\n"; 
                        } /* while */
                } /* if (nothing left in queue) */
+       
+               wfProfileOut( $fname );
                return $token;
        }
 
-               
+       // function continues
+       // Checks whether mText continues with $cont directly after the
+       // character at mPos, i.e. whether the bytes at offsets
+       // mPos+1 .. mPos+strlen($cont) are equal to $cont.
+       // Returns true on a full match, false otherwise.
+       /* private */ function continues( $cont )
+       {
+               $len = strlen( $cont );
+               // The last byte compared sits at offset mPos+$len. That offset must
+               // be at most mTextLength-1, otherwise the text is too short to
+               // contain $cont and reading it would run past the end of mText.
+               if ( $this->mTextLength <= $this->mPos + $len )
+                       return false;
+               for ( $i=0; $i < $len; $i++ )
+               {
+                       if ( $this->mText[$this->mPos+1+$i] != $cont[$i] )
+                               return false;
+               }
+               return true;
+       }
+
+       // function preceeded
+       // Tells whether the bytes of mText that end immediately before mPos
+       // are exactly $prec.
+       /* private */ function preceeded( $prec )
+       {
+               $needed = strlen( $prec );
+               if ( $needed <= $this->mPos )
+               {
+                       // enough text lies before mPos: compare that slice with $prec
+                       $before = substr( $this->mText, $this->mPos - $needed, $needed );
+                       return ( strcmp( $prec, $before ) == 0 );
+               }
+               // fewer than strlen( $prec ) bytes precede mPos
+               return false;
+       }
 }