mPos=0;
		$this->mTokenQueue=array();
		$this->linkPrefixExtension = $wgLang->linkPrefixExtension();
	}

	// Factory function: builds a tokenizer over the string $s.
	//
	// $s is handled byte-wise (its length is taken with strlen()).
	// Returns a Tokenizer whose text and text length are set from $s.
	function newFromString( $s )
	{
		$fname = "Tokenizer::newFromString";
		wfProfileIn( $fname );

		$t = new Tokenizer();
		$t->mText = $s;
		$t->mTextLength = strlen( $s );
		// The constructor initialises mTokenQueue, but every method below
		// works on mQueuedToken. Make sure that one starts out as an array,
		// otherwise count()/array_unshift() operate on null.
		$t->mQueuedToken = array();

		wfProfileOut( $fname );
		return $t;
	}

	// Return the next token without consuming it.
	//
	// If the queue is empty, the token is fetched with nextToken() (which
	// does advance the text position) and is then put at the front of the
	// queue, so the next call to previewToken() or nextToken() returns the
	// very same token again.
	//
	// Returns a token array ("type" and "text" keys) or false at end of text.
	function previewToken()
	{
		$fname = "Tokenizer::previewToken";
		wfProfileIn( $fname );

		if ( !empty( $this->mQueuedToken ) ) {
			// Still a token left from an earlier round: peek at it.
			$token = $this->mQueuedToken[0];
		} else {
			$token = $this->nextToken();
			// nextToken() may itself have queued a follow-up token; the
			// previewed one has to go in front of it. Guard against a
			// queue that was never initialised before unshifting.
			if ( !is_array( $this->mQueuedToken ) ) {
				$this->mQueuedToken = array();
			}
			array_unshift( $this->mQueuedToken, $token );
		}

		wfProfileOut( $fname );
		return $token;
	}

	// Get the next token and consume it.
	//
	// Queued tokens are returned first. Otherwise the text is scanned byte
	// by byte; plain bytes are collected into a token of type "text" until
	// one of the special sequences below is found. The special sequence is
	// put into the queue as its own token (so it is returned by the NEXT
	// call) and the text collected so far is returned, even if it is empty.
	//
	// Special tokens ("type" => meaning):
	//   "RFC ", "ISBN "  magic words, "text" equals "type"
	//   "[[[", "[["      link start; for "[[" the "text" carries a link
	//                    prefix when linkPrefixExtension is set
	//   "]]"             link end
	//   "''", "'''", ... a run of two or more apostrophes
	//   "----"           horizontal rule at the start of a line
	//   "blank"          a no-break space together with the character it
	//                    is glued to (French spacing, digit grouping)
	//
	// The no-break space is written as "\302\240" (U+00A0 in UTF-8) so that
	// it cannot be mistaken for, or normalised into, a plain space.
	//
	// Returns a token array ("type" and "text" keys) or false when no text
	// is left.
	function nextToken()
	{
		$fname = "Tokenizer::nextToken";
		wfProfileIn( $fname );

		if ( !empty( $this->mQueuedToken ) ) {
			// Still a token left from an earlier round: return that first.
			$token = array_shift( $this->mQueuedToken );
		} else if ( $this->mPos > $this->mTextLength ) {
			// No text is left.
			$token = false;
		} else {
			$token["text"] = "";
			$token["type"] = "text";

			// The loop deliberately runs one pass beyond the last byte (with
			// an empty $ch): that pass moves mPos past mTextLength, which is
			// what makes the following call return false.
			while ( $this->mPos <= $this->mTextLength ) {
				$ch = ( $this->mPos < $this->mTextLength )
					? $this->mText[$this->mPos]
					: "";

				switch ( $ch ) {
					case 'R': // for "RFC "
						if ( $this->continues( "FC " ) ) {
							$queueToken["type"] = $queueToken["text"] = "RFC ";
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 3;
							break 2; // switch + while
						}
						break;

					case 'I': // for "ISBN "
						if ( $this->continues( "SBN " ) ) {
							$queueToken["type"] = $queueToken["text"] = "ISBN ";
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 4;
							break 2; // switch + while
						}
						break;

					case "[": // for links "[[" and "[[["
						if ( $this->continues( "[[" ) ) {
							$queueToken["type"] = "[[[";
							$queueToken["text"] = "";
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 3;
							break 2; // switch + while
						} else if ( $this->continues( "[" ) ) {
							$queueToken["type"] = "[[";
							$queueToken["text"] = "";
							// Check for a "prefixed link", e.g. Al[[Khazar]]
							// Mostly for arabic wikipedia.
							// Move every trailing non-space byte of the text
							// collected so far over to the link's open tag.
							while ( $this->linkPrefixExtension
								&& ( $len = strlen( $token["text"] ) ) > 0
								&& !ctype_space( $token["text"][$len-1] ) )
							{
								// prepend the character to the link's open tag
								$queueToken["text"] = $token["text"][$len-1] . $queueToken["text"];
								// remove character from the end of the text token
								$token["text"] = substr( $token["text"], 0, -1 );
							}
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 2;
							break 2; // switch + while
						}
						break;

					case "]": // for end of links "]]"
						if ( $this->continues( "]" ) ) {
							$queueToken["type"] = "]]";
							$queueToken["text"] = "";
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 2;
							break 2; // switch + while
						}
						break;

					case "'": // for all kind of em's and strong's
						if ( $this->continues( "'" ) ) {
							$queueToken["type"] = "'";
							$queueToken["text"] = "";
							// The type grows by one apostrophe for every
							// further apostrophe in the run.
							while ( ( $this->mPos + 1 < $this->mTextLength )
								&& $this->mText[$this->mPos + 1] == "'" )
							{
								$queueToken["type"] .= "'";
								$this->mPos ++;
							}
							$this->mQueuedToken[] = $queueToken;
							$this->mPos ++;
							break 2; // switch + while
						}
						break;

					case "\n": // for block levels, actually, only "----" is handled.
					case "\r":
						if ( $this->continues( "----" ) ) {
							$queueToken["type"] = "----";
							$queueToken["text"] = "";
							$this->mQueuedToken[] = $queueToken;
							// skip the line break and the four dashes ...
							$this->mPos += 5;
							// ... and any further dashes that follow
							while ( $this->mPos < $this->mTextLength
								&& $this->mText[$this->mPos] == "-" )
							{
								$this->mPos ++;
							}
							break 2; // switch + while
						}
						break;

					case "!": // French spacing rules have a space before exclamation
					case "?": // and question marks. Those have to become no-break spaces.
					case ":": // And colons, Hashar says ...
						if ( $this->preceeded( " " ) ) {
							// strip blank from Token
							$token["text"] = substr( $token["text"], 0, -1 );
							$queueToken["type"] = "blank";
							$queueToken["text"] = "\302\240{$ch}";
							$this->mQueuedToken[] = $queueToken;
							$this->mPos ++;
							break 2; // switch + while
						}
						break;

					case "0": // A space between two numbers is used to ease reading
					case "1": // of big numbers, e.g. 1 000 000. Those spaces need
					case "2": // to be unbreakable
					case "3":
					case "4":
					case "5":
					case "6":
					case "7":
					case "8":
					case "9":
						// Index mPos+2 is read below, so it has to be a
						// valid offset, i.e. strictly below mTextLength.
						if ( ( $this->mTextLength > $this->mPos + 2 )
							&& ( $this->mText[$this->mPos + 1] == " " )
							&& ctype_digit( $this->mText[$this->mPos + 2] ) )
						{
							$queueToken["type"] = "blank";
							$queueToken["text"] = $ch . "\302\240";
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 2;
							break 2; // switch + while
						}
						break;

					case "\302": // first byte of UTF-8 Character Guillemet-left
						if ( $this->continues( "\253 " ) ) // second byte and a blank
						{
							$queueToken["type"] = "blank";
							$queueToken["text"] = "\302\253\302\240";
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 3;
							break 2; // switch + while
						}
						break;

					case "\273": // last byte of UTF-8 Character Guillemet-right
						if ( $this->preceeded( " \302" ) ) {
							$queueToken["type"] = "blank";
							$queueToken["text"] = "\302\240\302\273";
							// drop the blank and the guillemet's first byte,
							// both were already collected as plain text
							$token["text"] = substr( $token["text"], 0, -2 );
							$this->mQueuedToken[] = $queueToken;
							$this->mPos ++;
							break 2; // switch + while
						}
						break;
				} /* switch */

				// Nothing special: the byte is plain text.
				$token["text"] .= $ch;
				$this->mPos ++;
			} /* while */
		} /* if (nothing left in queue) */

		wfProfileOut( $fname );
		return $token;
	}

	// function continues
	// checks whether the mText continues with $cont from mPos+1
	//
	// $cont is compared byte-wise against mText[mPos+1 .. mPos+strlen($cont)].
	// Returns true on a complete match, false otherwise (including when the
	// remaining text is too short).
	/* private */ function continues( $cont )
	{
		$len = strlen( $cont );

		// The last byte compared sits at offset mPos + $len; if that is not
		// a valid offset the text is too short to contain $cont.
		if ( $this->mTextLength <= $this->mPos + $len ) {
			return false;
		}

		return substr( $this->mText, $this->mPos + 1, $len ) === $cont;
	}

	// function preceeded
	// checks whether the mText is preceeded by $prec at position mPos
	//
	// $prec is compared byte-wise against the strlen($prec) bytes that end
	// right before mPos. Returns true on a match, false otherwise.
	/* private */ function preceeded( $prec )
	{
		$len = strlen( $prec );

		// if $prec is longer than the text up to mPos, return false
		if ( $this->mPos < $len ) {
			return false;
		}

		$before = substr( $this->mText, $this->mPos - $len, $len );
		return ( 0 == strcmp( $prec, $before ) );
	}
}