includes/Tokenizer.php

   1 <?php
   2 class Tokenizer {
   3         /* private */ var $mText,               # Text to be processed by the tokenizer
   4                           $mPos,                # current position of tokenizer in text
   5                           $mTextLength,         # Length of $mText
   6                           $mQueuedToken;        # Tokens that were already found, but not
   7                                                 # returned yet.
   8
   9         /* private */ function Tokenizer()
  10         {
  11                 global $wgLang;
  12
  13                 $this->mPos=0;
  14                 $this->mTokenQueue=array();
  15                 $this->linkPrefixExtension = $wgLang->linkPrefixExtension();
  16         }
  17
  18         # factory function
  19         function newFromString( $s )
  20         {
  21                 $fname = "Tokenizer::newFromString";
  22                 wfProfileIn( $fname );
  23
  24                 $t = new Tokenizer();
  25                 $t->mText = $s;
  26                 $t->mTextLength = strlen( $s );
  27
  28                 wfProfileOut( $fname );
  29                 return $t;
  30         }
  31
  32
  33         // Return the next token, but do not increase the pointer. The next call
  34         // to previewToken or nextToken will return the same token again.
  35         // Actually, the pointer is increased, but the token is queued. The next
  36         // call to previewToken or nextToken will check the queue and return
  37         // the stored token.
  38         function previewToken()
  39         {
  40                 $fname = "Tokenizer::previewToken";
  41                 wfProfileIn( $fname );
  42
  43                 if ( count( $this->mQueuedToken ) != 0 ) {
  44                         // still one token from the last round around. Return that one first.
  45                         $token = $this->mQueuedToken[0];
  46                 } else {
  47                         $token = $this->nextToken();
  48                         array_unshift( $this->mQueuedToken, $token );
  49                 }
  50
  51                 wfProfileOut( $fname );
  52                 return $token;
  53         }
  54
  55
  56         // get the next token
  57         // proceeds character by character through the text, looking for characters needing
  58         // special attention. Those are currently: I, R, ', [, ], newline
  59         //
  60         // TODO:  handling of French blanks not yet implemented
  61         function nextToken()
  62         {
  63                 $fname = "Tokenizer::nextToken";
  64                 wfProfileIn( $fname );
  65
  66                 if ( count( $this->mQueuedToken ) != 0 ) {
  67                         // still one token from the last round around. Return that one first.
  68                         $token = array_shift( $this->mQueuedToken );
  69                 } else if ( $this->mPos > $this->mTextLength ) {
  70                         // If no text is left, return "false".
  71                         $token = false;
  72                 } else {
  73
  74                         $token["text"]="";
  75                         $token["type"]="text";
  76
  77                         while ( $this->mPos <= $this->mTextLength ) {
  78                                 switch ( @$ch = $this->mText[$this->mPos] ) {
  79                                         case 'R': // for "RFC "
  80                                                 if ( $this->continues("FC ") ) {
  81                                                         $queueToken["type"] = $queueToken["text"] = "RFC ";
  82                                                         $this->mQueuedToken[] = $queueToken;
  83                                                         $this->mPos += 3;
  84                                                         break 2; // switch + while
  85                                                 }
  86                                                 break;
  87                                         case 'I': // for "ISBN "
  88                                                 if ( $this->continues("SBN ") ) {
  89                                                         $queueToken["type"] = $queueToken["text"] = "ISBN ";
  90                                                         $this->mQueuedToken[] = $queueToken;
  91                                                         $this->mPos += 4;
  92                                                         break 2; // switch + while
  93                                                 }
  94                                                 break;
  95                                         case "[": // for links "[["
  96                                                 if ( $this->continues("[[") ) {
  97                                                         $queueToken["type"] = "[[[";
  98                                                         $queueToken["text"] = "";
  99                                                         $this->mQueuedToken[] = $queueToken;
 100                                                         $this->mPos += 3;
 101                                                         break 2; // switch + while
 102                                                 } else if ( $this->continues("[") ) {
 103                                                         $queueToken["type"] = "[[";
 104                                                         $queueToken["text"] = "";
 105                                                         // Check for a "prefixed link", e.g. Al[[Khazar]]
 106                                                         // Mostly for arabic wikipedia
 107                                                         if ( $this->linkPrefixExtension ) {
 108                                                                 while (    $this->linkPrefixExtension
 109                                                                         && ($len = strlen( $token["text"] ) ) > 0
 110                                                                         && !ctype_space( $token["text"][$len-1] ) )
 111                                                                 {
 112                                                                         //prepend the character to the link's open tag
 113                                                                         $queueToken["text"] = $token["text"][$len-1] . $queueToken["text"];
 114                                                                         //remove character from the end of the text token
 115                                                                         $token["text"] = substr( $token["text"], 0, -1);
 116                                                                 }
 117                                                         }
 118                                                         $this->mQueuedToken[] = $queueToken;
 119                                                         $this->mPos += 2;
 120                                                         break 2; // switch + while
 121                                                 }
 122                                                 break;
 123                                         case "]": // for end of links "]]"
 124                                                 if ( $this->continues("]") ) {
 125                                                         $queueToken["type"] = "]]";
 126                                                         $queueToken["text"] = "";
 127                                                         $this->mQueuedToken[] = $queueToken;
 128                                                         $this->mPos += 2;
 129                                                         break 2; // switch + while
 130                                                 }
 131                                                 break;
 132                                         case "'": // for all kind of em's and strong's
 133                                                 if ( $this->continues("'") ) {
 134                                                         $queueToken["type"] = "'";
 135                                                         $queueToken["text"] = "";
 136                                                         while(   ($this->mPos+1 < $this->mTextLength)
 137                                                                && $this->mText[$this->mPos+1] == "'" )
 138                                                         {
 139                                                                 $queueToken["type"] .= "'";
 140                                                                 $queueToken["pos"] = $this->mPos;
 141                                                                 $this->mPos ++;
 142                                                         }
 143
 144                                                         $this->mQueuedToken[] = $queueToken;
 145                                                         $this->mPos ++;
 146                                                         break 2; // switch + while
 147                                                 }
 148                                                 break;
 149                                         case "\n": // for block levels, actually, only "----" is handled.
 150                                         case "\r": // headings are detected to close any unbalanced em or strong tags in a section
 151                                                 if ( $this->continues( "----" ) )
 152                                                 {
 153                                                         $queueToken["type"] = "----";
 154                                                         $queueToken["text"] = "";
 155                                                         $this->mQueuedToken[] = $queueToken;
 156                                                         $this->mPos += 5;
 157                                                         while (     $this->mPos<$this->mTextLength
 158                                                                 and $this->mText[$this->mPos] == "-" )
 159                                                         {
 160                                                                 $this->mPos ++;
 161                                                         }
 162                                                         break 2;
 163                                                 } else if (
 164                                                         $this->continues( "<h" ) and (
 165                                                                 $this->continues( "<h1" ) or
 166                                                                 $this->continues( "<h2" ) or
 167                                                                 $this->continues( "<h3" ) or
 168                                                                 $this->continues( "<h4" ) or
 169                                                                 $this->continues( "<h5" ) or
 170                                                                 $this->continues( "<h6" )
 171                                                         )
 172                                                 ) { // heading
 173                                                         $queueToken["type"] = "h";
 174                                                         $queueToken["text"] = "";
 175                                                         $this->mQueuedToken[] = $queueToken;
 176                                                         $this->mPos ++;
 177                                                         break 2; // switch + while
 178                                                 }
 179                                                 break;
 180                                         case "!": // French spacing rules have a space before exclamation
 181                                         case "?": // and question marks. Those have to become &nbsp;
 182                                         case ":": // And colons, Hashar says ...
 183                                                 if ( $this->preceeded( " " ) )
 184                                                 {
 185                                                         // strip blank from Token
 186                                                         $token["text"] = substr( $token["text"], 0, -1 );
 187                                                         $queueToken["type"] = "blank";
 188                                                         $queueToken["text"] = " {$ch}";
 189                                                         $this->mQueuedToken[] = $queueToken;
 190                                                         $this->mPos ++;
 191                                                         break 2; // switch + while
 192                                                 }
 193                                                 break;
 194                                         case "0": // A space between two numbers is used to ease reading
 195                                         case "1": // of big numbers, e.g. 1 000 000. Those spaces need
 196                                         case "2": // to be unbreakable
 197                                         case "3":
 198                                         case "4":
 199                                         case "5":
 200                                         case "6":
 201                                         case "7":
 202                                         case "8":
 203                                         case "9":
 204                                                 if (    ($this->mTextLength >= $this->mPos +2)
 205                                                      && ($this->mText[$this->mPos+1] == " ")
 206                                                      && ctype_digit( $this->mText[$this->mPos+2] ) )
 207                                                 {
 208                                                         $queueToken["type"] = "blank";
 209                                                         $queueToken["text"] = $ch . " ";
 210                                                         $this->mQueuedToken[] = $queueToken;
 211                                                         $this->mPos += 2;
 212                                                         break 2; // switch + while
 213                                                 }
 214                                                 break;
 215                                         case "\302": // first byte of UTF-8 Character Guillemet-left
 216                                                 if ( $this->continues( "\253 ") ) // second byte and a blank
 217                                                 {
 218                                                         $queueToken["type"] = "blank";
 219                                                         $queueToken["text"] = "\302\253 ";
 220                                                         $this->mQueuedToken[] = $queueToken;
 221                                                         $this->mPos += 3;
 222                                                         break 2; // switch + while
 223                                                 }
 224                                                 break;
 225                                         case "\273": //last byte of UTF-8 Character Guillemet-right
 226                                                 if ( $this->preceeded( " \302" ) )
 227                                                 {
 228                                                         $queueToken["type"] = "blank";
 229                                                         $queueToken["text"] = " \302\273";
 230                                                         $token["text"] = substr( $token["text"], 0, -2 );
 231                                                         $this->mQueuedToken[] = $queueToken;
 232                                                         $this->mPos ++;
 233                                                         break 2; // switch + while
 234                                                 }
 235                                                 break;
 236                                         case "&": //extensions like <timeline>, since HTML stripping has already been done,
 237                                                   //those look like &lt;timeline&gt;
 238                                                 if ( $this->continues( "lt;timeline&gt;" ) )
 239                                                 {
 240                                                         $queueToken["type"] = "<timeline>";
 241                                                         $queueToken["text"] = "&lt;timeline&gt;";
 242                                                         $this->mQueuedToken[] = $queueToken;
 243                                                         $this->mPos += 16;
 244                                                         break 2; // switch + while
 245                                                 }
 246                                                 break;
 247
 248                                 } /* switch */
 249                                 $token["text"].=$ch;
 250                                 $this->mPos ++;
 251                                 // echo $this->mPos . "<br>\n";
 252                         } /* while */
 253                 } /* if (nothing left in queue) */
 254
 255                 wfProfileOut( $fname );
 256                 return $token;
 257         }
 258
 259         // function continues
 260         // checks whether the mText continues with $cont from mPos+1
 261         /* private */ function continues( $cont )
 262         {
 263                 // If string is not long enough to contain $cont, return false
 264                 if ( $this->mTextLength < $this->mPos + strlen( $cont ) )
 265                         return false;
 266                 for ( $i=0; $i < strlen( $cont ); $i++ )
 267                 {
 268                         if ( $this->mText[$this->mPos+1+$i] != $cont[$i] )
 269                                 return false;
 270                 }
 271                 return true;
 272         }
 273
 274         // function preceeded
 275         // checks whether the mText is preceeded by $prec at position mPos
 276         /* private */ function preceeded( $prec )
 277         {
 278                 $len = strlen( $prec );
 279                 // if $prec is longer than the text up to mPos, return false
 280                 if ( $this->mPos < $len )
 281                         return false;
 282                 return ( 0 == strcmp( $prec, substr($this->mText, $this->mPos-$len, $len) ) );
 283         }
 284
 285         function readAllUntil( $border )
 286         {
 287                 $n = strpos( $this->mText, $border, $this->mPos );
 288                 if ( $n === false )
 289                         return "";
 290                 $ret = substr( $this->mText, $this->mPos, $n - $this->mPos );
 291                 $this->mPos = $n + strlen( $border ) + 1;
 292                 return $ret;
 293         }
 294
 295 }
 296