/* private */ var $mText, # Text to be processed by the tokenizer
$mPos, # Current position of the tokenizer in $mText (offset used to index $mText)
$mTextLength, # Length of $mText, as computed by strlen()
- $mCount, # token count, computed in preParse
- $mMatch, # matches of tokenizer regex, computed in preParse
- $mMatchPos; # current token position of tokenizer. Each match can
- # be up to two tokens: A matched token and the text after it.
+ $mQueuedToken; # Array of tokens that were already found but not yet
+ # returned; nextToken() hands these out before reading more text.
+ // Constructor: starts at position 0 with an empty token queue and caches
+ // the result of $wgLang->linkPrefixExtension() for the "[[" handling in
+ // nextToken().
/* private */ function Tokenizer()
{
+ global $wgLang;
+
$this->mPos=0;
+ // The queue lives in $mQueuedToken, the property declared for this class
+ // and read by previewToken()/nextToken(). It has to start out as an array
+ // so that array_unshift() and array_shift() can operate on it.
+ $this->mQueuedToken=array();
+ $this->linkPrefixExtension = $wgLang->linkPrefixExtension();
}
# Factory function: creates a Tokenizer for the string $s and returns it.
function newFromString( $s )
{
+ $fname = "Tokenizer::newFromString";
+ wfProfileIn( $fname );
+
$t = new Tokenizer();
$t->mText = $s; # the text the tokenizer will walk through
- $t->preParse();
$t->mTextLength = strlen( $s ); # cached so the scanning code does not recompute it
+
+ wfProfileOut( $fname );
return $t;
}
- function preParse()
- {
- $this->mCount = preg_match_all( "/(\[\[|\]\]|\'\'\'\'\'|\'\'\'|\'\')/",
- $this->mText, $this->mMatch,
- PREG_PATTERN_ORDER|PREG_OFFSET_CAPTURE);
- $this->mMatchPos=0;
- }
- function nextToken()
+ // Return the next token without consuming it: the next call to
+ // previewToken or nextToken will return the same token again.
+ // Internally the token is fetched with nextToken(), which does advance
+ // the position, and is then stored at the front of $this->mQueuedToken;
+ // later calls check that queue first and return the stored token.
+ function previewToken()
{
- $token = $this->previewToken();
- if ( $token ) {
- if ( $token["type"] == "text" ) {
- $this->mPos = $token["mPos"];
- } else {
- $this->mMatchPos = $token["mMatchPos"];
- $this->mPos = $token["mPos"];
- }
+ $fname = "Tokenizer::previewToken";
+ wfProfileIn( $fname );
+
+ if ( count( $this->mQueuedToken ) != 0 ) {
+ // A token is already queued: return the first one and leave it in the queue.
+ $token = $this->mQueuedToken[0];
+ } else {
+ $token = $this->nextToken(); // may itself append a follow-up token to the queue
+ array_unshift( $this->mQueuedToken, $token ); // put the fetched token back in front of it
}
+
+ wfProfileOut( $fname );
return $token;
}
- function previewToken()
+ // Get the next token and advance the tokenizer.
+ //
+ // Returns an array with the keys "type" and "text", or false once the text
+ // is exhausted. Tokens waiting in $this->mQueuedToken are returned first.
+ // Otherwise the text is walked character by character, collecting a token of
+ // type "text", until a sequence needing special attention is found. The
+ // special token is then appended to the queue and the text collected so far
+ // (possibly empty) is returned; the special token follows on the next call.
+ //
+ // Special sequences handled here: "RFC ", "ISBN ", "[[[", "[[", "]]", runs of
+ // two or more apostrophes, a line break followed by "----", a blank before
+ // "!", "?" or ":" and inside guillemets (French spacing), a blank between
+ // two digits, and the escaped &lt;timeline&gt; tag.
+ function nextToken()
{
- if ( $this->mMatchPos <= $this->mCount ) {
- $token["pos"] = $this->mPos;
- if ( $this->mPos < $this->mMatch[0][$this->mMatchPos][1] ) {
- $token["type"] = "text";
- $token["text"] = substr( $this->mText, $this->mPos,
- $this->mMatch[0][$this->mMatchPos][1] - $this->mPos );
- $token["mPos"] = $this->mMatch[0][$this->mMatchPos][1];
- } else {
- $token["type"] = $this->mMatch[0][$this->mMatchPos][0];
- $token["mPos"] = $this->mPos + strlen($token["type"]);
- $token["mMatchPos"] = $this->mMatchPos + 1;
- }
- } elseif ( $this->mPos < $this->mTextLength ) {
- $token["type"] = "text";
- $token["text"] = substr( $this->mText, $this->mPos );
- $token["mPos"] = $this->mTextLength;
+ $fname = "Tokenizer::nextToken";
+ wfProfileIn( $fname );
+
+ if ( count( $this->mQueuedToken ) != 0 ) {
+ // still one token from the last round around. Return that one first.
+ $token = array_shift( $this->mQueuedToken );
+ } else if ( $this->mPos > $this->mTextLength ) {
+ // If no text is left, return "false".
+ $token = false;
} else {
- $token = FALSE;
- }
+
+ $token["text"]="";
+ $token["type"]="text";
+
+ // "<=" is deliberate: the pass at mPos == mTextLength reads nothing
+ // (hence the @ below) but moves mPos past the end, so that the next
+ // call takes the "no text is left" branch above and returns false.
+ while ( $this->mPos <= $this->mTextLength ) {
+ switch ( @$ch = $this->mText[$this->mPos] ) {
+ case 'R': // for "RFC "
+ if ( $this->continues("FC ") ) {
+ $queueToken["type"] = $queueToken["text"] = "RFC ";
+ $this->mQueuedToken[] = $queueToken;
+ // advances past "RFC" only; mPos is left on the blank
+ $this->mPos += 3;
+ break 2; // switch + while
+ }
+ break;
+ case 'I': // for "ISBN "
+ if ( $this->continues("SBN ") ) {
+ $queueToken["type"] = $queueToken["text"] = "ISBN ";
+ $this->mQueuedToken[] = $queueToken;
+ // advances past "ISBN" only; mPos is left on the blank
+ $this->mPos += 4;
+ break 2; // switch + while
+ }
+ break;
+ case "[": // for links "[["
+ if ( $this->continues("[[") ) {
+ $queueToken["type"] = "[[[";
+ $queueToken["text"] = "";
+ $this->mQueuedToken[] = $queueToken;
+ $this->mPos += 3;
+ break 2; // switch + while
+ } else if ( $this->continues("[") ) {
+ $queueToken["type"] = "[[";
+ $queueToken["text"] = "";
+ // Check for a "prefixed link", e.g. Al[[Khazar]]
+ // Mostly for arabic wikipedia
+ if ( $this->linkPrefixExtension ) {
+ // Move trailing non-blank characters of the text collected so far
+ // into the link token, one character at a time.
+ while ( $this->linkPrefixExtension
+ && ($len = strlen( $token["text"] ) ) > 0
+ && !ctype_space( $token["text"][$len-1] ) )
+ {
+ //prepend the character to the link's open tag
+ $queueToken["text"] = $token["text"][$len-1] . $queueToken["text"];
+ //remove character from the end of the text token
+ $token["text"] = substr( $token["text"], 0, -1);
+ }
+ }
+ $this->mQueuedToken[] = $queueToken;
+ $this->mPos += 2;
+ break 2; // switch + while
+ }
+ break;
+ case "]": // for end of links "]]"
+ if ( $this->continues("]") ) {
+ $queueToken["type"] = "]]";
+ $queueToken["text"] = "";
+ $this->mQueuedToken[] = $queueToken;
+ $this->mPos += 2;
+ break 2; // switch + while
+ }
+ break;
+ case "'": // for all kind of em's and strong's
+ if ( $this->continues("'") ) {
+ // The token type is the whole run of apostrophes, e.g. "''" or "'''"
+ $queueToken["type"] = "'";
+ $queueToken["text"] = "";
+ while( ($this->mPos+1 < $this->mTextLength)
+ && $this->mText[$this->mPos+1] == "'" )
+ {
+ $queueToken["type"] .= "'";
+ $queueToken["pos"] = $this->mPos;
+ $this->mPos ++;
+ }
+
+ $this->mQueuedToken[] = $queueToken;
+ $this->mPos ++;
+ break 2; // switch + while
+ }
+ break;
+ case "\n": // for block levels, actually, only "----" is handled.
+ case "\r":
+ if ( $this->continues( "----" ) )
+ {
+ $queueToken["type"] = "----";
+ $queueToken["text"] = "";
+ $this->mQueuedToken[] = $queueToken;
+ // skip the line break and the four dashes ...
+ $this->mPos += 5;
+ // ... and any further dashes following them
+ while ( $this->mPos<$this->mTextLength
+ and $this->mText[$this->mPos] == "-" )
+ {
+ $this->mPos ++;
+ }
+ break 2;
+ }
+ break;
+ case "!": // French spacing rules have a space before exclamation
+ case "?": // and question marks. Those have to become
+ case ":": // And colons, Hashar says ...
+ if ( $this->preceeded( " " ) )
+ {
+ // strip blank from Token
+ $token["text"] = substr( $token["text"], 0, -1 );
+ $queueToken["type"] = "blank";
+ $queueToken["text"] = " {$ch}";
+ $this->mQueuedToken[] = $queueToken;
+ $this->mPos ++;
+ break 2; // switch + while
+ }
+ break;
+ case "0": // A space between two numbers is used to ease reading
+ case "1": // of big numbers, e.g. 1 000 000. Those spaces need
+ case "2": // to be unbreakable
+ case "3":
+ case "4":
+ case "5":
+ case "6":
+ case "7":
+ case "8":
+ case "9":
+ // The look-ahead reads offsets mPos+1 and mPos+2, so mPos+2 must be
+ // a valid offset, i.e. strictly smaller than the text length.
+ if ( ($this->mTextLength > $this->mPos + 2)
+ && ($this->mText[$this->mPos+1] == " ")
+ && ctype_digit( $this->mText[$this->mPos+2] ) )
+ {
+ $queueToken["type"] = "blank";
+ $queueToken["text"] = $ch . " ";
+ $this->mQueuedToken[] = $queueToken;
+ $this->mPos += 2;
+ break 2; // switch + while
+ }
+ break;
+ case "\302": // first byte of UTF-8 Character Guillemet-left
+ if ( $this->continues( "\253 ") ) // second byte and a blank
+ {
+ $queueToken["type"] = "blank";
+ $queueToken["text"] = "\302\253 ";
+ $this->mQueuedToken[] = $queueToken;
+ $this->mPos += 3;
+ break 2; // switch + while
+ }
+ break;
+ case "\273": //last byte of UTF-8 Character Guillemet-right
+ if ( $this->preceeded( " \302" ) )
+ {
+ $queueToken["type"] = "blank";
+ $queueToken["text"] = " \302\273";
+ // the blank and the first guillemet byte were already collected as text
+ $token["text"] = substr( $token["text"], 0, -2 );
+ $this->mQueuedToken[] = $queueToken;
+ $this->mPos ++;
+ break 2; // switch + while
+ }
+ break;
+ case "&": //extensions like <timeline>, since HTML stripping has already been done,
+ //those look like &lt;timeline&gt;
+ // "&" plus "lt;timeline&gt;" is 16 bytes, matching the advance below.
+ if ( $this->continues( "lt;timeline&gt;" ) )
+ {
+ $queueToken["type"] = "<timeline>";
+ $queueToken["text"] = "<timeline>";
+ $this->mQueuedToken[] = $queueToken;
+ $this->mPos += 16;
+ break 2; // switch + while
+ }
+ break;
+
+ } /* switch */
+ // Nothing special at this position: the character is ordinary text.
+ $token["text"].=$ch;
+ $this->mPos ++;
+ // echo $this->mPos . "<br>\n";
+ } /* while */
+ } /* if (nothing left in queue) */
+
+ wfProfileOut( $fname );
return $token;
}
-
+ // function continues
+ // Checks whether mText continues with $cont starting at mPos+1, i.e.
+ // directly after the character the tokenizer is currently looking at.
+ // $cont: the string expected to follow the current character
+ // Returns true if it does, false if it does not or if the text is too
+ // short to contain it.
+ /* private */ function continues( $cont )
+ {
+ $len = strlen( $cont );
+ // The comparison covers offsets mPos+1 .. mPos+$len, so the last of
+ // these must still be a valid offset (smaller than the text length).
+ if ( $this->mTextLength <= $this->mPos + $len )
+ return false;
+ return ( 0 == strcmp( $cont, substr( $this->mText, $this->mPos+1, $len ) ) );
+ }
+
+ // function preceeded
+ // Tells whether the text directly in front of position mPos is $prec.
+ // Returns false when fewer than strlen( $prec ) characters come before mPos.
+ /* private */ function preceeded( $prec )
+ {
+ $precLength = strlen( $prec );
+ $start = $this->mPos - $precLength;
+ // not enough text in front of mPos to hold $prec
+ if ( $start < 0 )
+ return false;
+ $before = substr( $this->mText, $start, $precLength );
+ return ( strcmp( $prec, $before ) == 0 );
+ }
+
+ // Returns the text between the current position and the next occurrence
+ // of $border, and moves the position behind that occurrence.
+ // Returns "" and leaves the position unchanged if $border is not found.
+ function readAllUntil( $border )
+ {
+ $start = $this->mPos;
+ $borderPos = strpos( $this->mText, $border, $start );
+ if ( $borderPos === false )
+ return "";
+ // NOTE(review): the extra +1 skips one character after $border --
+ // confirm against the callers that this is intended.
+ $this->mPos = $borderPos + strlen( $border ) + 1;
+ return substr( $this->mText, $start, $borderPos - $start );
+ }
+
}