From: Jens Frank Date: Sat, 10 Apr 2004 07:53:52 +0000 (+0000) Subject: Rewrite of Tokenizer. Not using regex any more. X-Git-Tag: 1.3.0beta1~482 X-Git-Url: http://git.cyclocoop.org/%24self?a=commitdiff_plain;h=f9164a40fb4afeee962be37cfe8f1166fc6547f7;p=lhc%2Fweb%2Fwiklou.git Rewrite of Tokenizer. Not using regex any more. TODO: prefixed links for Arabic wikipedia not implemented yet handling of French blanks not yet implemented --- diff --git a/includes/Tokenizer.php b/includes/Tokenizer.php index 2771319641..2b9c9e7c21 100644 --- a/includes/Tokenizer.php +++ b/includes/Tokenizer.php @@ -3,14 +3,13 @@ class Tokenizer { /* private */ var $mText, # Text to be processed by the tokenizer $mPos, # current position of tokenizer in text $mTextLength, # Length of $mText - $mCount, # token count, computed in preParse - $mMatch, # matches of tokenizer regex, computed in preParse - $mMatchPos; # current token position of tokenizer. Each match can - # be up to two tokens: A matched token and the text after it. + $mQueuedToken; # Tokens that were already found, but not + # returned yet. /* private */ function Tokenizer() { $this->mPos=0; + $this->mTokenQueue=array(); } # factory function @@ -18,92 +17,133 @@ class Tokenizer { { $t = new Tokenizer(); $t->mText = $s; - $t->preParse(); $t->mTextLength = strlen( $s ); + // echo "New tokenizer generated.
<pre>{$s}</pre>
\n"; return $t; } - function preParse() - { - global $wgLang; - - # build up the regex, step by step. - # Basic features: Quotes for / and hyphens for
- $regex = "\'\'\'\'\'|\'\'\'|\'\'|\n-----*"; - # Append regex for linkPrefixExtension - if ( $wgLang->linkPrefixExtension() ) { - $regex .= "|([a-zA-Z\x80-\xff]+)\[\["; - } else { - # end tag that can start with 3 [ - $regex .= "|\[\[\[?"; - } - # Closing link - $regex .= "|\]\]"; - # Magic words that automatically generate links - $regex .= "|ISBN |RFC "; - # Language-specific additions - $regex .= $wgLang->tokenizerRegex(); - # Finalize regex - $regex = "/(" . $regex . ")/"; - # Apply the regex to the text - $this->mCount = preg_match_all( $regex, $this->mText, $this->mMatch, - PREG_PATTERN_ORDER|PREG_OFFSET_CAPTURE); - $this->mMatchPos=0; - } - - function nextToken() + // Return the next token, but do not increase the pointer. The next call + // to previewToken or nextToken will return the same token again. + // Actually, the pointer is increased, but the token is queued. The next + // call to previewToken or nextToken will check the queue and return + // the stored token. + function previewToken() { - $token = $this->previewToken(); - if ( $token ) { - $this->mMatchPos = $token["mMatchPos"]; - $this->mPos = $token["mPos"]; + if ( count( $this->mQueuedToken ) != 0 ) { + // still one token from the last round around. Return that one first. + $token = $this->mQueuedToken[0]; + } else { + $token = $this->nextToken(); + array_unshift( $this->mQueuedToken, $token ); } return $token; } - function previewToken() + // get the next token + // proceeds character by character through the text, looking for characters needing + // special attention. 
Those are currently: I, R, ', [, ], newline + // + // TODO: prefixed links for Arabic wikipedia not implemented yet + // handling of French blanks not yet implemented + function nextToken() { - if ( $this->mMatchPos < $this->mCount ) { - $token["pos"] = $this->mPos; - if ( $this->mPos < $this->mMatch[0][$this->mMatchPos][1] ) { - $token["type"] = "text"; - $token["text"] = substr( $this->mText, $this->mPos, - $this->mMatch[0][$this->mMatchPos][1] - $this->mPos ); - # What the pointers would change to if this would not just be a preview - $token["mMatchPos"] = $this->mMatchPos; - $token["mPos"] = $this->mMatch[0][$this->mMatchPos][1]; - } else { - # If linkPrefixExtension is set, $this->mMatch[2][$this->mMatchPos][0] - # contains the link prefix, or is null if no link prefix exist. - if ( isset( $this->mMatch[2] ) && $this->mMatch[2][$this->mMatchPos][0] ) - { - # prefixed link open tag, [0] is "prefix[[" - $token["type"] = "[["; - $token["text"] = $this->mMatch[2][$this->mMatchPos][0]; # the prefix - } else { - $token["type"] = $this->mMatch[0][$this->mMatchPos][0]; - if ( substr($token["type"],1,4) == "----" ) - { - # any number of hyphens bigger than four is a
. - # strip down to four. - $token["type"]="----"; - } - } - # What the pointers would change to if this would not just be a preview - $token["mPos"] = $this->mPos + strlen( $this->mMatch[0][$this->mMatchPos][0] ); - $token["mMatchPos"] = $this->mMatchPos + 1; - } - } elseif ( $this->mPos < $this->mTextLength ) { - $token["type"] = "text"; - $token["text"] = substr( $this->mText, $this->mPos ); - # What the pointers would change to if this would not just be a preview - $token["mPos"] = $this->mTextLength; - $token["mMatchPos"] = $this->mMatchPos; + if ( count( $this->mQueuedToken ) != 0 ) { + // still one token from the last round around. Return that one first. + $token = array_shift( $this->mQueuedToken ); } else { - $token = FALSE; - } + + $token["text"]=""; + $token["type"]="text"; + + // If no text is left, return "false". + if ( $this->mPos > $this->mTextLength ) + return false; + + while ( $this->mPos <= $this->mTextLength ) { + switch ( $ch = $this->mText[$this->mPos] ) { + case 'R': // for "RFC " + if ( $this->mText[$this->mPos+1] == 'F' && + $this->mText[$this->mPos+2] == 'C' && + $this->mText[$this->mPos+4] == ' ' ) { + $queueToken["type"] = $queueToken["text"] = "RFC "; + $this->mQueuedToken[] = $queueToken; + $this->mPos += 3; + break 2; // switch + while + } + break; + case 'I': // for "ISBN " + if ( $this->mText[$this->mPos+1] == 'S' && + $this->mText[$this->mPos+2] == 'B' && + $this->mText[$this->mPos+3] == 'N' && + $this->mText[$this->mPos+4] == ' ' ) { + $queueToken["type"] = $queueToken["text"] = "ISBN "; + $this->mQueuedToken[] = $queueToken; + $this->mPos += 4; + break 2; // switch + while + } + break; + case "[": // for links "[[" + if ( $this->mText[$this->mPos+1] == "[" && + $this->mText[$this->mPos+2] == "[" ) { + $queueToken["type"] = "[[["; + $queueToken["text"] = ""; + $this->mQueuedToken[] = $queueToken; + $this->mPos += 3; + break 2; // switch + while + } else if ( $this->mText[$this->mPos+1] == "[" ) { + $queueToken["type"] = "[["; + 
$queueToken["text"] = ""; + $this->mQueuedToken[] = $queueToken; + $this->mPos += 2; + break 2; // switch + while + } + break; + case "]": // for end of links "]]" + if ( $this->mText[$this->mPos+1] == "]" ) { + $queueToken["type"] = "]]"; + $queueToken["text"] = ""; + $this->mQueuedToken[] = $queueToken; + $this->mPos += 2; + break 2; // switch + while + } + break; + case "'": // for all kind of em's and strong's + if ( $this->mText[$this->mPos+1] == "'" ) { + $queueToken["type"] = "'"; + $queueToken["text"] = ""; + while ( $this->mText[$this->mPos+1] == "'" ) { + $queueToken["type"] .= "'"; + $this->mPos ++; + } + + $this->mQueuedToken[] = $queueToken; + $this->mPos ++; + break 2; // switch + while + } + break; + case "\n": // for block levels, actually, only "----" is handled. + case "\r": + if ( $this->mText[$this->mPos+1] == "-" && + $this->mText[$this->mPos+2] == "-" && + $this->mText[$this->mPos+3] == "-" && + $this->mText[$this->mPos+4] == "-" ) { + $queueToken["type"] = "----"; + $queueToken["text"] = ""; + $this->mQueuedToken[] = $queueToken; + $this->mPos += 5; + while ($this->mText[$this->mPos] == "-" ) { + $this->mPos ++; + } + break 2; + } + } /* switch */ + $token["text"].=$ch; + $this->mPos ++; + // echo $this->mPos . "
\n"; + } /* while */ + } /* if (nothing left in queue) */ return $token; }