10 /* private */ var $mText, # Text to be processed by the tokenizer
11 $mPos, # current position of tokenizer in text
12 $mTextLength, # Length of $mText
13 $mQueuedToken; # Tokens that were already found, but not
20 function Tokenizer() {
// Constructor (PHP 4 style: the method carries the class name used in the
// 'Tokenizer::...' profiling labels elsewhere in this file).
// NOTE(review): this initialises mTokenQueue, but previewToken() and
// nextToken() read and write mQueuedToken; mTokenQueue is not used anywhere
// in the visible code. Looks like a stale property name - confirm.
24 $this->mTokenQueue
=array();
// Cache the language's link-prefix flag; nextToken() consults it when it
// handles '['.
// NOTE(review): no `global $wgLang;` is visible in this view - confirm it is
// declared on one of the lines not shown.
25 $this->linkPrefixExtension
= $wgLang->linkPrefixExtension();
31 function newFromString( $s ) {
// Factory taking the text $s to tokenize. The creation of $t and the return
// statement are not visible in this view; what is visible is that the byte
// length of $s is stored on $t and that the work is bracketed by
// wfProfileIn()/wfProfileOut().
// $s is passed to strlen(), so a string is expected.
32 $fname = 'Tokenizer::newFromString';
33 wfProfileIn( $fname );
// strlen() counts bytes, not characters.
37 $t->mTextLength
= strlen( $s );
39 wfProfileOut( $fname );
45 * Return the next token, but do not increase the pointer. The next call
46 * to previewToken or nextToken will return the same token again.
47 * Actually, the pointer is increased, but the token is queued. The next
48 * call to previewToken or nextToken will check the queue and return
51 function previewToken() {
// Peek at the upcoming token without consuming it.
52 $fname = 'Tokenizer::previewToken';
53 wfProfileIn( $fname );
// Queue not empty: read its head in place (index 0), leaving it queued.
55 if ( count( $this->mQueuedToken
) != 0 ) {
56 // still one token from the last round around. Return that one first.
57 $token = $this->mQueuedToken
[0];
// Otherwise (the branch header is not visible in this view) fetch a token
// via nextToken() and push it back onto the FRONT of the queue, so the next
// previewToken()/nextToken() call sees the same token again.
59 $token = $this->nextToken();
60 array_unshift( $this->mQueuedToken
, $token );
63 wfProfileOut( $fname );
71 * Proceeds character by character through the text, looking for characters needing
72 * special attention. Those are currently: I, R, ', [, ], newline, carriage return, !, ?, :, digits, the guillemet bytes \302 and \273, and &
74 * @todo handling of French blanks not yet implemented
76 function nextToken() {
// Produce the next token. Tokens are arrays with at least 'type' and 'text'
// keys. Plain text is accumulated in $token (type 'text'); whenever a
// special character sequence is recognised, a marker token is appended to
// $this->mQueuedToken and scanning stops via `break 2`, so the marker is
// delivered by a later call.
77 $fname = 'Tokenizer::nextToken';
78 wfProfileIn( $fname );
// 1) Anything already queued is handed out first (removed from the front).
80 if ( count( $this->mQueuedToken
) != 0 ) {
81 // still one token from the last round around. Return that one first.
82 $token = array_shift( $this->mQueuedToken
);
// 2) Position beyond the text length: nothing left to scan.
83 } else if ( $this->mPos
> $this->mTextLength
) {
84 // If no text is left, return 'false'.
// 3) Otherwise start a fresh text token and scan forward.
89 $token['type']='text';
// The loop bound is inclusive (<=), so the last iteration reads index
// mTextLength. If mTextLength is strlen(mText), as newFromString() suggests,
// that index is one past the final byte; the @ on the switch expression
// suppresses the resulting notice.
91 while ( $this->mPos
<= $this->mTextLength
) {
92 switch ( @$ch = $this->mText
[$this->mPos
] ) {
93 case 'R': // for "RFC "
94 if ( $this->continues('FC ') ) {
95 $queueToken['type'] = $queueToken['text'] = 'RFC ';
96 $this->mQueuedToken
[] = $queueToken;
98 break 2; // switch + while
101 case 'I': // for "ISBN "
102 if ( $this->continues('SBN ') ) {
103 $queueToken['type'] = $queueToken['text'] = 'ISBN ';
104 $this->mQueuedToken
[] = $queueToken;
106 break 2; // switch + while
109 case '[': // for links "[["
// continues() looks at the characters AFTER the current one, so '[[' here
// means three opening brackets in a row and '[' means two.
110 if ( $this->continues('[[') ) {
111 $queueToken['type'] = '[[[';
112 $queueToken['text'] = '';
113 $this->mQueuedToken
[] = $queueToken;
115 break 2; // switch + while
116 } else if ( $this->continues('[') ) {
117 $queueToken['type'] = '[[';
118 $queueToken['text'] = '';
119 // Check for a "prefixed link", e.g. Al[[Khazar]]
120 // Mostly for arabic wikipedia
121 if ( $this->linkPrefixExtension
) {
// Move trailing non-whitespace characters, one at a time, from the end of
// the pending text token to the front of the link token's text.
// NOTE(review): linkPrefixExtension is tested again in the loop condition
// although the enclosing if already guarantees it; appears redundant.
122 while ( $this->linkPrefixExtension
123 && ($len = strlen( $token['text'] ) ) > 0
124 && !ctype_space( $token['text'][$len-1] ) )
126 //prepend the character to the link's open tag
127 $queueToken['text'] = $token['text'][$len-1] . $queueToken['text'];
128 //remove character from the end of the text token
129 $token['text'] = substr( $token['text'], 0, -1);
132 $this->mQueuedToken
[] = $queueToken;
134 break 2; // switch + while
137 case ']': // for end of links "]]"
138 if ( $this->continues(']') ) {
139 $queueToken['type'] = ']]';
140 $queueToken['text'] = '';
141 $this->mQueuedToken
[] = $queueToken;
143 break 2; // switch + while
146 case "'": // for all kind of em's and strong's
// A run of apostrophes becomes one token whose 'type' is the run itself:
// one apostrophe is appended to 'type' per additional apostrophe found.
147 if ( $this->continues("'") ) {
148 $queueToken['type'] = "'";
149 $queueToken['text'] = '';
150 while( ($this->mPos+
1 < $this->mTextLength
)
151 && $this->mText
[$this->mPos+
1] == "'" )
153 $queueToken['type'] .= "'";
154 $queueToken['pos'] = $this->mPos
;
158 $this->mQueuedToken
[] = $queueToken;
160 break 2; // switch + while
163 case "\n": // for block levels, actually, only "----" is handled.
164 case "\r": // headings are detected to close any unbalanced em or strong tags in a section
165 if ( $this->continues( '----' ) )
167 $queueToken['type'] = '----';
168 $queueToken['text'] = '';
169 $this->mQueuedToken
[] = $queueToken;
// Loop over any further '-' characters at the current position (the loop
// body is not visible in this view).
171 while ( $this->mPos
<$this->mTextLength
172 and $this->mText
[$this->mPos
] == '-' )
// Heading detection: the cheap '<h' test runs first, then the six concrete
// heading levels; a match queues a token of type 'h'.
178 $this->continues( '<h' ) and (
179 $this->continues( '<h1' ) or
180 $this->continues( '<h2' ) or
181 $this->continues( '<h3' ) or
182 $this->continues( '<h4' ) or
183 $this->continues( '<h5' ) or
184 $this->continues( '<h6' )
187 $queueToken['type'] = 'h';
188 $queueToken['text'] = '';
189 $this->mQueuedToken
[] = $queueToken;
191 break 2; // switch + while
194 case '!': // French spacing rules have a space before exclamation
195 case '?': // and question marks. Those have to become
196 case ':': // And colons, Hashar says ...
// When a blank precedes the punctuation, the blank is removed from the
// pending text token and re-emitted, together with the punctuation
// character, as a 'blank' token.
197 if ( $this->preceeded( ' ' ) )
199 // strip blank from Token
200 $token['text'] = substr( $token['text'], 0, -1 );
201 $queueToken['type'] = 'blank';
202 $queueToken['text'] = ' '.$ch;
203 $this->mQueuedToken
[] = $queueToken;
205 break 2; // switch + while
208 case '0': // A space between two numbers is used to ease reading
209 case '1': // of big numbers, e.g. 1 000 000. Those spaces need
210 case '2': // to be unbreakable
// Lookahead for "<digit><space><digit>".
// NOTE(review): the guard admits mPos + 2 == mTextLength, in which case
// mText[mPos + 2] is one past the last byte (assuming mTextLength is the
// byte length of mText) - a strict `>` looks intended; confirm.
218 if ( ($this->mTextLength
>= $this->mPos +
2)
219 && ($this->mText
[$this->mPos+
1] == ' ')
220 && ctype_digit( $this->mText
[$this->mPos+
2] ) )
222 $queueToken['type'] = 'blank';
223 $queueToken['text'] = $ch . ' ';
224 $this->mQueuedToken
[] = $queueToken;
226 break 2; // switch + while
229 case "\302": // first byte of UTF-8 Character Guillemet-left
230 if ( $this->continues( "\253 ") ) // second byte and a blank
232 $queueToken['type'] = 'blank';
233 $queueToken['text'] = "\302\253 ";
234 $this->mQueuedToken
[] = $queueToken;
236 break 2; // switch + while
239 case "\273": //last byte of UTF-8 Character Guillemet-right
// The two bytes before the current one (a blank and "\302") already sit in
// the pending text token, hence the removal of its last two bytes below.
240 if ( $this->preceeded( " \302" ) )
242 $queueToken['type'] = 'blank';
243 $queueToken['text'] = " \302\273";
244 $token['text'] = substr( $token['text'], 0, -2 );
245 $this->mQueuedToken
[] = $queueToken;
247 break 2; // switch + while
250 case '&': //extensions like <timeline>, since HTML stripping has already been done,
251 //those look like <timeline>
252 if ( $this->continues( 'lt;timeline>' ) )
254 $queueToken['type'] = '<timeline>';
255 $queueToken['text'] = '<timeline>';
256 $this->mQueuedToken
[] = $queueToken;
258 break 2; // switch + while
265 // echo $this->mPos . "<br>\n";
267 } /* if (nothing left in queue) */
269 wfProfileOut( $fname );
276 * checks whether the mText continues with $cont from mPos+1
280 function continues( $cont ) {
// Lookahead helper: compares $cont byte by byte against mText starting at
// the position AFTER the current one (mPos + 1). The current character
// itself is not part of the comparison.
// $cont is passed to strlen() and indexed with [], so a string is expected.
// NOTE(review): the last byte compared is at index mPos + strlen($cont). The
// guard below only bails out when mTextLength is strictly smaller than that,
// so when the two are equal the loop reads one byte past the end of the text
// (assuming mTextLength is the byte length of mText). `<=` looks intended -
// confirm.
281 // If string is not long enough to contain $cont, return false
282 if ( $this->mTextLength
< $this->mPos +
strlen( $cont ) )
// strlen( $cont ) is re-evaluated on every iteration; $cont does not change
// inside the visible loop.
284 for ( $i=0; $i < strlen( $cont ); $i++
)
// Loose comparison (!=) of single characters.
286 if ( $this->mText
[$this->mPos+
1+
$i] != $cont[$i] )
295 * checks whether the mText is preceded by $prec at position mPos
299 function preceeded( $prec ) {
// Lookbehind helper: tests whether the strlen($prec) bytes immediately
// before mPos equal $prec. The character at mPos itself is not included.
// Returns the boolean result of the comparison on the last line.
300 $len = strlen( $prec );
301 // if $prec is longer than the text up to mPos, return false
302 if ( $this->mPos
< $len )
// substr() extracts exactly $len bytes ending just before mPos; strcmp()
// returning 0 means the two strings are identical.
304 return ( 0 == strcmp( $prec, substr($this->mText
, $this->mPos
-$len, $len) ) );
310 function readAllUntil( $border ) {
311 $n = strpos( $this->mText
, $border, $this->mPos
);
314 $ret = substr( $this->mText
, $this->mPos
, $n - $this->mPos
);
315 $this->mPos
= $n +
strlen( $border ) +
1;