Repaired rendering of ''emph ''' bold emph '' bold ''' to produce valid HTML.
[lhc/web/wiklou.git] / includes / Tokenizer.php
index e563a7c..0b313d5 100644 (file)
@@ -3,109 +3,279 @@ class Tokenizer {
        /* private */ var $mText,               # Text to be processed by the tokenizer
                          $mPos,                # current position of tokenizer in text
                          $mTextLength,         # Length of $mText
-                         $mCount,              # token count, computed in preParse
-                         $mMatch,              # matches of tokenizer regex, computed in preParse
-                         $mMatchPos;           # current token position of tokenizer. Each match can
-                                               # be up to two tokens: A matched token and the text after it.
+                         $mQueuedToken;        # Tokens that were already found, but not
+                                               # returned yet.
 
        /* private */ function Tokenizer()
        {
+               global $wgLang;
+
                $this->mPos=0;
+               $this->mTokenQueue=array();
+               $this->linkPrefixExtension = $wgLang->linkPrefixExtension();
        }
 
        # factory function
        function newFromString( $s )
        {
+               $fname = "Tokenizer::newFromString";    # name handed to wfProfileIn()/wfProfileOut() below
+               wfProfileIn( $fname );
+
                $t = new Tokenizer();
                $t->mText = $s;
-               $t->preParse();
                $t->mTextLength = strlen( $s );
+
+               wfProfileOut( $fname );                 # paired with the wfProfileIn() call above
                return $t;
        }
 
-       function preParse()
+
+       // Peek at the next token without consuming it: the following call to
+       // previewToken() or nextToken() returns this same token again.
+       // Internally the text position does advance (through nextToken()), but the
+       // token is put at the front of $mQueuedToken, which both methods consult
+       // before scanning any further text. Returns the token nextToken() would return.
+       function previewToken()
        {
-               global $wgLang;
+               $fname = "Tokenizer::previewToken";
+               wfProfileIn( $fname );
 
-               # build up the regex, step by step.
-               # Basic features: Quotes for <em>/<strong> and hyphens for <hr>
-               $regex = "\'\'\'\'\'|\'\'\'|\'\'|\n-----*";
-               # Append regex for linkPrefixExtension 
-               if (  $wgLang->linkPrefixExtension() ) {
-                       $regex .= "|([a-zA-Z\x80-\xff]+)\[\[";
+               if ( count( $this->mQueuedToken ) != 0 ) {
+                       // A token is still queued from an earlier call: return it, but leave it in the queue.
+                       $token = $this->mQueuedToken[0];
                } else {
-                       $regex .= "|\[\[";
+                       $token = $this->nextToken();
+                       array_unshift( $this->mQueuedToken, $token ); // goes in front of any token nextToken() appended to the queue
                }
-               # Closing link
-               $regex .= "|\]\]";
-               # Magic words that automatically generate links
-               $regex .= "|ISBN |RFC ";
-               # Language-specific additions
-               $regex .= $wgLang->tokenizerRegex();
-               # Finalize regex
-               $regex = "/(" . $regex . ")/";
-
-               # Apply the regex to the text
-               $this->mCount = preg_match_all( $regex, $this->mText, $this->mMatch,
-                                               PREG_PATTERN_ORDER|PREG_OFFSET_CAPTURE);
-               $this->mMatchPos=0;
+
+               wfProfileOut( $fname );
+               return $token;
        }
 
+
+       // Return the next token and advance the position in the text.
+       // Tokens already waiting in $mQueuedToken are handed out first; false is returned
+       // once mPos has moved past the end of the text. Otherwise the text is scanned byte
+       // by byte, collecting a "text" token until a marker is met: "RFC " / "ISBN ",
+       // "[[[" / "[[" / "]]", a run of apostrophes, a line break followed by "----",
+       // French-spacing blanks (before ! ? : and next to guillemets), a blank between two
+       // digits, or "&lt;timeline&gt;". The marker token is appended to the queue and the
+       // text collected before it (possibly "") is returned as an array with the keys
+       // "type" and "text".
        function nextToken()
        {
-               $token = $this->previewToken();
-               if ( $token ) {
-                       $this->mMatchPos = $token["mMatchPos"];
-                       $this->mPos = $token["mPos"];
-               }
+               $fname = "Tokenizer::nextToken";
+               wfProfileIn( $fname );
+
+               if ( count( $this->mQueuedToken ) != 0 ) {
+                       // still one token from the last round around. Return that one first.
+                       $token = array_shift( $this->mQueuedToken );
+               } else if ( $this->mPos > $this->mTextLength ) {
+                       // If no text is left, return "false".
+                       $token = false;
+               } else {
+
+                       $token["text"]="";
+                       $token["type"]="text";
+
+                       while ( $this->mPos <= $this->mTextLength ) {
+                               // The final iteration runs with mPos == mTextLength. There is no byte
+                               // to read then, so $ch is "" and matches none of the cases below.
+                               $ch = ( $this->mPos < $this->mTextLength ) ? $this->mText[$this->mPos] : "";
+                               switch ( $ch ) {
+                                       case 'R': // for "RFC "
+                                               if ( $this->continues("FC ") ) {
+                                                       $queueToken["type"] = $queueToken["text"] = "RFC ";
+                                                       $this->mQueuedToken[] = $queueToken;
+                                                       // NOTE(review): advances 3 of the 4 bytes, so the blank is
+                                                       // scanned again as text - confirm the parser relies on this.
+                                                       $this->mPos += 3;
+                                                       break 2; // switch + while
+                                               }
+                                               break;
+                                       case 'I': // for "ISBN "
+                                               if ( $this->continues("SBN ") ) {
+                                                       $queueToken["type"] = $queueToken["text"] = "ISBN ";
+                                                       $this->mQueuedToken[] = $queueToken;
+                                                       // NOTE(review): as for "RFC ", the trailing blank is left unread.
+                                                       $this->mPos += 4;
+                                                       break 2; // switch + while
+                                               }
+                                               break;
+                                       case "[": // for links "[["
+                                               if ( $this->continues("[[") ) {
+                                                       $queueToken["type"] = "[[[";
+                                                       $queueToken["text"] = "";
+                                                       $this->mQueuedToken[] = $queueToken;
+                                                       $this->mPos += 3;
+                                                       break 2; // switch + while
+                                               } else if ( $this->continues("[") ) {
+                                                       $queueToken["type"] = "[[";
+                                                       $queueToken["text"] = "";
+                                                       // Check for a "prefixed link", e.g. Al[[Khazar]]
+                                                       // Mostly for arabic wikipedia
+                                                       if ( $this->linkPrefixExtension ) {
+                                                               // Move the non-blank bytes at the end of the text token
+                                                               // over to the link token, one byte at a time.
+                                                               while (    ($len = strlen( $token["text"] ) ) > 0
+                                                                       && !ctype_space( $token["text"][$len-1] ) )
+                                                               {
+                                                                       //prepend the character to the link's open tag
+                                                                       $queueToken["text"] = $token["text"][$len-1] . $queueToken["text"];
+                                                                       //remove character from the end of the text token
+                                                                       $token["text"] = substr( $token["text"], 0, -1);
+                                                               }
+                                                       }
+                                                       $this->mQueuedToken[] = $queueToken;
+                                                       $this->mPos += 2;
+                                                       break 2; // switch + while
+                                               }
+                                               break;
+                                       case "]": // for end of links "]]"
+                                               if ( $this->continues("]") ) {
+                                                       $queueToken["type"] = "]]";
+                                                       $queueToken["text"] = "";
+                                                       $this->mQueuedToken[] = $queueToken;
+                                                       $this->mPos += 2;
+                                                       break 2; // switch + while
+                                               }
+                                               break;
+                                       case "'": // for all kind of em's and strong's
+                                               if ( $this->continues("'") ) {
+                                                       // The token type is the whole run of apostrophes ("''", "'''", ...).
+                                                       $queueToken["type"] = "'";
+                                                       $queueToken["text"] = "";
+                                                       while(   ($this->mPos+1 < $this->mTextLength)
+                                                              && $this->mText[$this->mPos+1] == "'" )
+                                                       {
+                                                               $queueToken["type"] .= "'";
+                                                               $queueToken["pos"] = $this->mPos;
+                                                               $this->mPos ++;
+                                                       }
+
+                                                       $this->mQueuedToken[] = $queueToken;
+                                                       $this->mPos ++;
+                                                       break 2; // switch + while
+                                               }
+                                               break;
+                                       case "\n": // for block levels, actually, only "----" is handled.
+                                       case "\r":
+                                               if ( $this->continues( "----" ) )
+                                               {
+                                                       $queueToken["type"] = "----";
+                                                       $queueToken["text"] = "";
+                                                       $this->mQueuedToken[] = $queueToken;
+                                                       $this->mPos += 5;
+                                                       // swallow any further hyphens of the same rule
+                                                       while (     $this->mPos<$this->mTextLength
+                                                               and $this->mText[$this->mPos] == "-" )
+                                                       {
+                                                               $this->mPos ++;
+                                                       }
+                                                       break 2;
+                                               }
+                                               break;
+                                       case "!": // French spacing rules have a space before exclamation
+                                       case "?": // and question marks. Those have to become &nbsp;
+                                       case ":": // And colons, Hashar says ...
+                                               if ( $this->preceeded( " " ) )
+                                               {
+                                                       // strip blank from Token
+                                                       $token["text"] = substr( $token["text"], 0, -1 );
+                                                       $queueToken["type"] = "blank";
+                                                       $queueToken["text"] = " {$ch}";
+                                                       $this->mQueuedToken[] = $queueToken;
+                                                       $this->mPos ++;
+                                                       break 2; // switch + while
+                                               }
+                                               break;
+                                       case "0": // A space between two numbers is used to ease reading
+                                       case "1": // of big numbers, e.g. 1 000 000. Those spaces need
+                                       case "2": // to be unbreakable
+                                       case "3":
+                                       case "4":
+                                       case "5":
+                                       case "6":
+                                       case "7":
+                                       case "8":
+                                       case "9":
+                                               // mText[mPos+2] is inspected, so that index has to exist:
+                                               // the length must be strictly greater than mPos+2.
+                                               if (    ($this->mTextLength > $this->mPos +2)
+                                                    && ($this->mText[$this->mPos+1] == " ")
+                                                    && ctype_digit( $this->mText[$this->mPos+2] ) )
+                                               {
+                                                       $queueToken["type"] = "blank";
+                                                       $queueToken["text"] = $ch . " ";
+                                                       $this->mQueuedToken[] = $queueToken;
+                                                       $this->mPos += 2;
+                                                       break 2; // switch + while
+                                               }
+                                               break;
+                                       case "\302": // first byte of UTF-8 Character Guillemet-left
+                                               if ( $this->continues( "\253 ") ) // second byte and a blank
+                                               {
+                                                       $queueToken["type"] = "blank";
+                                                       $queueToken["text"] = "\302\253 ";
+                                                       $this->mQueuedToken[] = $queueToken;
+                                                       $this->mPos += 3;
+                                                       break 2; // switch + while
+                                               }
+                                               break;
+                                       case "\273": //last byte of UTF-8 Character Guillemet-right
+                                               if ( $this->preceeded( " \302" ) )
+                                               {
+                                                       $queueToken["type"] = "blank";
+                                                       $queueToken["text"] = " \302\273";
+                                                       // drop the blank and the first guillemet byte already collected
+                                                       $token["text"] = substr( $token["text"], 0, -2 );
+                                                       $this->mQueuedToken[] = $queueToken;
+                                                       $this->mPos ++;
+                                                       break 2; // switch + while
+                                               }
+                                               break;
+                                       case "&": //extensions like <timeline>, since HTML stripping has already been done,
+                                                 //those look like &lt;timeline&gt;
+                                               if ( $this->continues( "lt;timeline&gt;" ) )
+                                               {
+                                                       $queueToken["type"] = "<timeline>";
+                                                       $queueToken["text"] = "&lt;timeline&gt;";
+                                                       $this->mQueuedToken[] = $queueToken;
+                                                       $this->mPos += 16;
+                                                       break 2; // switch + while
+                                               }
+                                               break;
+
+                               } /* switch */
+                               // Not a marker (or a marker character whose test failed): plain text.
+                               $token["text"].=$ch;
+                               $this->mPos ++;
+                               // echo $this->mPos . "<br>\n";
+                       } /* while */
+               } /* if (nothing left in queue) */
+
+               wfProfileOut( $fname );
                return $token;
        }
 
-
-       function previewToken()
+       // function continues
+       // Checks whether mText continues with $cont directly after the byte at mPos,
+       // i.e. whether the bytes starting at mPos+1 are exactly $cont. Returns true or false.
+       /* private */ function continues( $cont )
        {
-               if ( $this->mMatchPos < $this->mCount  ) {
-                       $token["pos"] = $this->mPos;
-                       if ( $this->mPos < $this->mMatch[0][$this->mMatchPos][1] ) {
-                               $token["type"] = "text";
-                               $token["text"] = substr( $this->mText, $this->mPos,
-                                                        $this->mMatch[0][$this->mMatchPos][1] - $this->mPos );
-                               # What the pointers would change to if this would not just be a preview
-                               $token["mMatchPos"] = $this->mMatchPos; 
-                               $token["mPos"] = $this->mMatch[0][$this->mMatchPos][1];
-                       } else {
-                               # If linkPrefixExtension is set,  $this->mMatch[2][$this->mMatchPos][0]
-                               # contains the link prefix, or is null if no link prefix exist.
-                               if ( isset( $this->mMatch[2] ) && $this->mMatch[2][$this->mMatchPos][0] )
-                               {
-                                       # prefixed link open tag, [0] is "prefix[["
-                                       $token["type"] = "[[";
-                                       $token["text"] = $this->mMatch[2][$this->mMatchPos][0]; # the prefix
-                               } else {
-                                       $token["type"] = $this->mMatch[0][$this->mMatchPos][0];
-                                       if ( substr($token["type"],1,4) == "----" )
-                                       {
-                                               # any number of hyphens bigger than four is a <HR>. 
-                                               # strip down to four.
-                                               $token["type"]="----";
-                                       }
-                               }
-                               # What the pointers would change to if this would not just be a preview
-                               $token["mPos"] = $this->mPos + strlen( $this->mMatch[0][$this->mMatchPos][0] );
-                               $token["mMatchPos"] = $this->mMatchPos + 1;
-                       }
-               } elseif ( $this->mPos < $this->mTextLength ) {
-                       $token["type"] = "text";
-                       $token["text"] = substr( $this->mText, $this->mPos );
-                       # What the pointers would change to if this would not just be a preview
-                       $token["mPos"] = $this->mTextLength;
-                       $token["mMatchPos"] = $this->mMatchPos;
-               } else {
-                       $token = FALSE;
+               $len = strlen( $cont );
+               // The loop below reads the indexes mPos+1 .. mPos+$len, so the last valid
+               // index (mTextLength-1) must be at least mPos+$len. Otherwise the text is
+               // too short to contain $cont: return false without reading past its end.
+               if ( $this->mTextLength <= $this->mPos + $len )
+                       return false;
+               for ( $i=0; $i < $len; $i++ )
+               {
+                       if ( $this->mText[$this->mPos+1+$i] !== $cont[$i] )
+                               return false;
                }
-               return $token;
+               return true;
+       }
+
+       // preceeded( $prec )
+       // Reports whether the bytes of mText that end directly before mPos are exactly $prec.
+       /* private */ function preceeded( $prec )
+       {
+               $width = strlen( $prec );
+               if ( $this->mPos >= $width ) {
+                       // enough text lies before mPos to hold $prec: compare that slice
+                       $before = substr( $this->mText, $this->mPos - $width, $width );
+                       return ( strcmp( $prec, $before ) == 0 );
+               }
+               // fewer than strlen( $prec ) bytes come before mPos
+               return false;
+       }
+
+       // Return the text between mPos and the next occurrence of $border (the border
+       // itself is not included) and move mPos behind the border. If $border does not
+       // occur at or after mPos, "" is returned and mPos is left unchanged.
+       // NOTE(review): the new position is one byte beyond the end of $border, so the
+       // byte that directly follows the border is never tokenized - confirm this is
+       // intended and not an off-by-one.
+       function readAllUntil( $border )
+       {
+               $start = $this->mPos;
+               $hit = strpos( $this->mText, $border, $start );
+               if ( $hit !== false ) {
+                       $this->mPos = $hit + strlen( $border ) + 1;
+                       return substr( $this->mText, $start, $hit - $start );
+               }
+               return "";
        }
 
-               
 }