includes/Tokenizer.php

   1 <?php
   2 /**
   3  *
   4  */
   5
   6 /**
   7  *
   8  */
   9 class Tokenizer {
  10         /* private */ var $mText,               # Text to be processed by the tokenizer
  11                           $mPos,                # current position of tokenizer in text
  12                           $mTextLength,         # Length of $mText
  13                           $mQueuedToken;        # Tokens that were already found, but not
  14                                                 # returned yet.
  15
  16         /**
  17          * Constructor
  18          * @access private
  19          */
  20         function Tokenizer() {
  21                 global $wgLang;
  22
  23                 $this->mPos=0;
  24                 $this->mTokenQueue=array();
  25                 $this->linkPrefixExtension = $wgLang->linkPrefixExtension();
  26         }
  27
  28         /**
  29          * factory function
  30          */
  31         function newFromString( $s ) {
  32                 $fname = 'Tokenizer::newFromString';
  33                 wfProfileIn( $fname );
  34
  35                 $t = new Tokenizer();
  36                 $t->mText = $s;
  37                 $t->mTextLength = strlen( $s );
  38
  39                 wfProfileOut( $fname );
  40                 return $t;
  41         }
  42
  43
  44         /**
  45          * Return the next token, but do not increase the pointer. The next call
  46          * to previewToken or nextToken will return the same token again.
  47          * Actually, the pointer is increased, but the token is queued. The next
  48          * call to previewToken or nextToken will check the queue and return
  49          * the stored token.
  50          */
  51         function previewToken() {
  52                 $fname = 'Tokenizer::previewToken';
  53                 wfProfileIn( $fname );
  54
  55                 if ( count( $this->mQueuedToken ) != 0 ) {
  56                         // still one token from the last round around. Return that one first.
  57                         $token = $this->mQueuedToken[0];
  58                 } else {
  59                         $token = $this->nextToken();
  60                         array_unshift( $this->mQueuedToken, $token );
  61                 }
  62
  63                 wfProfileOut( $fname );
  64                 return $token;
  65         }
  66
  67
  68         /**
  69          * Get the next token.
  70          *
  71          * proceeds character by character through the text, looking for characters needing
  72          * special attention. Those are currently: I, R, ', [, ], newline
  73          *
  74          * @todo handling of French blanks not yet implemented
  75          */
  76         function nextToken() {
  77                 $fname = 'Tokenizer::nextToken';
  78                 wfProfileIn( $fname );
  79
  80                 if ( count( $this->mQueuedToken ) != 0 ) {
  81                         // still one token from the last round around. Return that one first.
  82                         $token = array_shift( $this->mQueuedToken );
  83                 } else if ( $this->mPos > $this->mTextLength ) {
  84                         // If no text is left, return 'false'.
  85                         $token = false;
  86                 } else {
  87
  88                         $token['text']='';
  89                         $token['type']='text';
  90
  91                         while ( $this->mPos <= $this->mTextLength ) {
  92                                 switch ( @$ch = $this->mText[$this->mPos] ) {
  93                                         case 'R': // for "RFC "
  94                                                 if ( $this->continues('FC ') ) {
  95                                                         $queueToken['type'] = $queueToken['text'] = 'RFC ';
  96                                                         $this->mQueuedToken[] = $queueToken;
  97                                                         $this->mPos += 3;
  98                                                         break 2; // switch + while
  99                                                 }
 100                                                 break;
 101                                         case 'I': // for "ISBN "
 102                                                 if ( $this->continues('SBN ') ) {
 103                                                         $queueToken['type'] = $queueToken['text'] = 'ISBN ';
 104                                                         $this->mQueuedToken[] = $queueToken;
 105                                                         $this->mPos += 4;
 106                                                         break 2; // switch + while
 107                                                 }
 108                                                 break;
 109                                         case '[': // for links "[["
 110                                                 if ( $this->continues('[[') ) {
 111                                                         $queueToken['type'] = '[[[';
 112                                                         $queueToken['text'] = '';
 113                                                         $this->mQueuedToken[] = $queueToken;
 114                                                         $this->mPos += 3;
 115                                                         break 2; // switch + while
 116                                                 } else if ( $this->continues('[') ) {
 117                                                         $queueToken['type'] = '[[';
 118                                                         $queueToken['text'] = '';
 119                                                         // Check for a "prefixed link", e.g. Al[[Khazar]]
 120                                                         // Mostly for arabic wikipedia
 121                                                         if ( $this->linkPrefixExtension ) {
 122                                                                 while (    $this->linkPrefixExtension
 123                                                                         && ($len = strlen( $token['text'] ) ) > 0
 124                                                                         && !ctype_space( $token['text'][$len-1] ) )
 125                                                                 {
 126                                                                         //prepend the character to the link's open tag
 127                                                                         $queueToken['text'] = $token['text'][$len-1] . $queueToken['text'];
 128                                                                         //remove character from the end of the text token
 129                                                                         $token['text'] = substr( $token['text'], 0, -1);
 130                                                                 }
 131                                                         }
 132                                                         $this->mQueuedToken[] = $queueToken;
 133                                                         $this->mPos += 2;
 134                                                         break 2; // switch + while
 135                                                 }
 136                                                 break;
 137                                         case ']': // for end of links "]]"
 138                                                 if ( $this->continues(']') ) {
 139                                                         $queueToken['type'] = ']]';
 140                                                         $queueToken['text'] = '';
 141                                                         $this->mQueuedToken[] = $queueToken;
 142                                                         $this->mPos += 2;
 143                                                         break 2; // switch + while
 144                                                 }
 145                                                 break;
 146                                         case "'": // for all kind of em's and strong's
 147                                                 if ( $this->continues("'") ) {
 148                                                         $queueToken['type'] = "'";
 149                                                         $queueToken['text'] = '';
 150                                                         while(   ($this->mPos+1 < $this->mTextLength)
 151                                                                && $this->mText[$this->mPos+1] == "'" )
 152                                                         {
 153                                                                 $queueToken['type'] .= "'";
 154                                                                 $queueToken['pos'] = $this->mPos;
 155                                                                 $this->mPos ++;
 156                                                         }
 157
 158                                                         $this->mQueuedToken[] = $queueToken;
 159                                                         $this->mPos ++;
 160                                                         break 2; // switch + while
 161                                                 }
 162                                                 break;
 163                                         case "\n": // for block levels, actually, only "----" is handled.
 164                                         case "\r": // headings are detected to close any unbalanced em or strong tags in a section
 165                                                 if ( $this->continues( '----' ) )
 166                                                 {
 167                                                         $queueToken['type'] = '----';
 168                                                         $queueToken['text'] = '';
 169                                                         $this->mQueuedToken[] = $queueToken;
 170                                                         $this->mPos += 5;
 171                                                         while (     $this->mPos<$this->mTextLength
 172                                                                 and $this->mText[$this->mPos] == '-' )
 173                                                         {
 174                                                                 $this->mPos ++;
 175                                                         }
 176                                                         break 2;
 177                                                 } else if (
 178                                                         $this->continues( '<h' ) and (
 179                                                                 $this->continues( '<h1' ) or
 180                                                                 $this->continues( '<h2' ) or
 181                                                                 $this->continues( '<h3' ) or
 182                                                                 $this->continues( '<h4' ) or
 183                                                                 $this->continues( '<h5' ) or
 184                                                                 $this->continues( '<h6' )
 185                                                         )
 186                                                 ) { // heading
 187                                                         $queueToken['type'] = 'h';
 188                                                         $queueToken['text'] = '';
 189                                                         $this->mQueuedToken[] = $queueToken;
 190                                                         $this->mPos ++;
 191                                                         break 2; // switch + while
 192                                                 }
 193                                                 break;
 194                                         case '!': // French spacing rules have a space before exclamation
 195                                         case '?': // and question marks. Those have to become &nbsp;
 196                                         case ':': // And colons, Hashar says ...
 197                                                 if ( $this->preceeded( ' ' ) )
 198                                                 {
 199                                                         // strip blank from Token
 200                                                         $token['text'] = substr( $token['text'], 0, -1 );
 201                                                         $queueToken['type'] = 'blank';
 202                                                         $queueToken['text'] = ' '.$ch;
 203                                                         $this->mQueuedToken[] = $queueToken;
 204                                                         $this->mPos ++;
 205                                                         break 2; // switch + while
 206                                                 }
 207                                                 break;
 208                                         case '0': // A space between two numbers is used to ease reading
 209                                         case '1': // of big numbers, e.g. 1 000 000. Those spaces need
 210                                         case '2': // to be unbreakable
 211                                         case '3':
 212                                         case '4':
 213                                         case '5':
 214                                         case '6':
 215                                         case '7':
 216                                         case '8':
 217                                         case '9':
 218                                                 if (    ($this->mTextLength >= $this->mPos +2)
 219                                                      && ($this->mText[$this->mPos+1] == ' ')
 220                                                      && ctype_digit( $this->mText[$this->mPos+2] ) )
 221                                                 {
 222                                                         $queueToken['type'] = 'blank';
 223                                                         $queueToken['text'] = $ch . ' ';
 224                                                         $this->mQueuedToken[] = $queueToken;
 225                                                         $this->mPos += 2;
 226                                                         break 2; // switch + while
 227                                                 }
 228                                                 break;
 229                                         case "\302": // first byte of UTF-8 Character Guillemet-left
 230                                                 if ( $this->continues( "\253 ") ) // second byte and a blank
 231                                                 {
 232                                                         $queueToken['type'] = 'blank';
 233                                                         $queueToken['text'] = "\302\253 ";
 234                                                         $this->mQueuedToken[] = $queueToken;
 235                                                         $this->mPos += 3;
 236                                                         break 2; // switch + while
 237                                                 }
 238                                                 break;
 239                                         case "\273": //last byte of UTF-8 Character Guillemet-right
 240                                                 if ( $this->preceeded( " \302" ) )
 241                                                 {
 242                                                         $queueToken['type'] = 'blank';
 243                                                         $queueToken['text'] = " \302\273";
 244                                                         $token['text'] = substr( $token['text'], 0, -2 );
 245                                                         $this->mQueuedToken[] = $queueToken;
 246                                                         $this->mPos ++;
 247                                                         break 2; // switch + while
 248                                                 }
 249                                                 break;
 250                                         case '&': //extensions like <timeline>, since HTML stripping has already been done,
 251                                                   //those look like &lt;timeline&gt;
 252                                                 if ( $this->continues( 'lt;timeline&gt;' ) )
 253                                                 {
 254                                                         $queueToken['type'] = '<timeline>';
 255                                                         $queueToken['text'] = '&lt;timeline&gt;';
 256                                                         $this->mQueuedToken[] = $queueToken;
 257                                                         $this->mPos += 16;
 258                                                         break 2; // switch + while
 259                                                 }
 260                                                 break;
 261
 262                                 } /* switch */
 263                                 $token['text'].=$ch;
 264                                 $this->mPos ++;
 265                                 // echo $this->mPos . "<br>\n";
 266                         } /* while */
 267                 } /* if (nothing left in queue) */
 268
 269                 wfProfileOut( $fname );
 270                 return $token;
 271         }
 272
 273         /**
 274          * function continues
 275          *
 276          * checks whether the mText continues with $cont from mPos+1
 277          *
 278          * @access private
 279          */
 280         function continues( $cont ) {
 281                 // If string is not long enough to contain $cont, return false
 282                 if ( $this->mTextLength < $this->mPos + strlen( $cont ) )
 283                         return false;
 284                 for ( $i=0; $i < strlen( $cont ); $i++ )
 285                 {
 286                         if ( $this->mText[$this->mPos+1+$i] != $cont[$i] )
 287                                 return false;
 288                 }
 289                 return true;
 290         }
 291
 292         /**
 293          * function preceeded
 294          *
 295          * checks whether the mText is preceeded by $prec at position mPos
 296          *
 297          * @access private
 298          */
 299         function preceeded( $prec ) {
 300                 $len = strlen( $prec );
 301                 // if $prec is longer than the text up to mPos, return false
 302                 if ( $this->mPos < $len )
 303                         return false;
 304                 return ( 0 == strcmp( $prec, substr($this->mText, $this->mPos-$len, $len) ) );
 305         }
 306
 307         /**
 308          *
 309          */
 310         function readAllUntil( $border ) {
 311                 $n = strpos( $this->mText, $border, $this->mPos );
 312                 if ( $n === false )
 313                         return '';
 314                 $ret = substr( $this->mText, $this->mPos, $n - $this->mPos );
 315                 $this->mPos = $n + strlen( $border ) + 1;
 316                 return $ret;
 317         }
 318
 319 }