Changing comments layout preparing for generated documentation with Phpdocumentor
[lhc/web/wiklou.git] / includes / Tokenizer.php
1 <?php
2 /**
3 *
4 */
5
/**
 * Breaks a text into a stream of tokens.
 *
 * A token is an array with the keys 'type' and 'text'; tokens for runs of
 * apostrophes carry a 'pos' key as well. Stretches of ordinary characters
 * come back as tokens of type 'text', the character sequences recognised by
 * nextToken() get a type of their own. Once the text is used up, boolean
 * false is returned instead of a token.
 *
 * All positions and lengths are byte offsets (strlen() / string indexing).
 */
class Tokenizer {
	/* private */ var $mText,	# Text to be processed by the tokenizer
		$mPos,			# Current position of the tokenizer in $mText
		$mTextLength,		# Length of $mText
		$mQueuedToken,		# Tokens that were already found, but not
					# returned yet.
		$linkPrefixExtension;	# Value of $wgLang->linkPrefixExtension(); when
					# it is true, non-blank characters directly in
					# front of "[[" are moved into the link token.

	/**
	 * Constructor
	 *
	 * Creates a tokenizer without any text; newFromString() is the way to
	 * obtain one that has something to work on.
	 *
	 * @access private
	 */
	function __construct() {
		global $wgLang;

		$this->mText = '';
		$this->mTextLength = 0;
		$this->mPos = 0;
		// previewToken() and nextToken() count(), shift and unshift this
		// property, so it has to be a real array from the very beginning.
		$this->mQueuedToken = array();
		$this->linkPrefixExtension = $wgLang->linkPrefixExtension();
	}

	/**
	 * Old-style constructor, kept so that the class is still initialised by
	 * engines that look for a method named like the class and so that
	 * explicit calls to it keep working.
	 *
	 * @access private
	 */
	function Tokenizer() {
		$this->__construct();
	}

	/**
	 * Factory function
	 *
	 * @param string $s text to be tokenized
	 * @return Tokenizer a tokenizer positioned at the start of $s
	 */
	function newFromString( $s ) {
		$fname = 'Tokenizer::newFromString';
		wfProfileIn( $fname );

		$t = new Tokenizer();
		$t->mText = $s;
		$t->mTextLength = strlen( $s );

		wfProfileOut( $fname );
		return $t;
	}


	/**
	 * Return the next token without consuming it. The next call to
	 * previewToken() or nextToken() returns the same token again.
	 *
	 * Internally the token is fetched with nextToken() (so the text pointer
	 * does move) and is then put back at the front of the queue, where the
	 * following call finds it.
	 *
	 * @return mixed token array, or false if no text is left
	 */
	function previewToken() {
		$fname = 'Tokenizer::previewToken';
		wfProfileIn( $fname );

		if ( count( $this->mQueuedToken ) != 0 ) {
			// A token is already waiting in the queue; show it, but leave it there.
			$token = $this->mQueuedToken[0];
		} else {
			// nextToken() may append a special token to the queue while it
			// scans, so the token it returns has to go in front of that one.
			$token = $this->nextToken();
			array_unshift( $this->mQueuedToken, $token );
		}

		wfProfileOut( $fname );
		return $token;
	}


	/**
	 * Get the next token.
	 *
	 * Proceeds character by character through the text and collects the
	 * characters into a token of type 'text'. Scanning stops as soon as one
	 * of the following is found at the current position:
	 *  - "RFC " and "ISBN "
	 *  - "[[[", "[[" and "]]"
	 *  - a run of two or more apostrophes
	 *  - a line break followed by "----" or by "<h1" .. "<h6"
	 *  - a blank in front of "!", "?" or ":"
	 *  - a digit, a blank and another digit
	 *  - a left guillemet followed by a blank, or a blank followed by a
	 *    right guillemet (both as UTF-8 byte sequences)
	 *  - "&lt;timeline&gt;"
	 * The special token is put into the queue and the text collected so far
	 * (which may be empty) is returned; the special token itself is handed
	 * out by the following call.
	 *
	 * @return mixed token array, or false if no text is left
	 */
	function nextToken() {
		$fname = 'Tokenizer::nextToken';
		wfProfileIn( $fname );

		if ( count( $this->mQueuedToken ) != 0 ) {
			// still one token from the last round around. Return that one first.
			$token = array_shift( $this->mQueuedToken );
		} else if ( $this->mPos > $this->mTextLength ) {
			// If no text is left, return 'false'.
			$token = false;
		} else {

			$token = array( 'text' => '', 'type' => 'text' );

			while ( $this->mPos <= $this->mTextLength ) {
				// The loop deliberately makes one pass with mPos == mTextLength.
				// There is no character at that position, so $ch is empty,
				// matches no case below, and mPos moves beyond the end of
				// the text, which makes the following call return false.
				if ( $this->mPos < $this->mTextLength ) {
					$ch = $this->mText[$this->mPos];
				} else {
					$ch = '';
				}

				switch ( $ch ) {
				case 'R': // for "RFC "
					if ( $this->continues( 'FC ' ) ) {
						$queueToken['type'] = $queueToken['text'] = 'RFC ';
						$this->mQueuedToken[] = $queueToken;
						// This leaves mPos on the blank behind "RFC", so the
						// blank is also the first character of the next text
						// token. NOTE(review): confirm that callers expect this.
						$this->mPos += 3;
						break 2; // switch + while
					}
					break;
				case 'I': // for "ISBN "
					if ( $this->continues( 'SBN ' ) ) {
						$queueToken['type'] = $queueToken['text'] = 'ISBN ';
						$this->mQueuedToken[] = $queueToken;
						// As for "RFC ": mPos ends up on the trailing blank.
						$this->mPos += 4;
						break 2; // switch + while
					}
					break;
				case '[': // for links "[["
					if ( $this->continues( '[[' ) ) {
						$queueToken = array( 'type' => '[[[', 'text' => '' );
						$this->mQueuedToken[] = $queueToken;
						$this->mPos += 3;
						break 2; // switch + while
					} else if ( $this->continues( '[' ) ) {
						$queueToken = array( 'type' => '[[', 'text' => '' );
						// Check for a "prefixed link", e.g. Al[[Khazar]]
						// Mostly for arabic wikipedia
						while ( $this->linkPrefixExtension
							&& ( $len = strlen( $token['text'] ) ) > 0
							&& !ctype_space( $token['text'][$len - 1] ) )
						{
							// prepend the character to the link's open tag
							$queueToken['text'] = $token['text'][$len - 1] . $queueToken['text'];
							// remove character from the end of the text token
							$token['text'] = substr( $token['text'], 0, -1 );
						}
						$this->mQueuedToken[] = $queueToken;
						$this->mPos += 2;
						break 2; // switch + while
					}
					break;
				case ']': // for end of links "]]"
					if ( $this->continues( ']' ) ) {
						$queueToken = array( 'type' => ']]', 'text' => '' );
						$this->mQueuedToken[] = $queueToken;
						$this->mPos += 2;
						break 2; // switch + while
					}
					break;
				case "'": // for all kind of em's and strong's
					if ( $this->continues( "'" ) ) {
						// The type of the token is the run of apostrophes itself.
						$queueToken = array( 'type' => "'", 'text' => '' );
						while ( ( $this->mPos + 1 < $this->mTextLength )
							&& $this->mText[$this->mPos + 1] == "'" )
						{
							$queueToken['type'] .= "'";
							$queueToken['pos'] = $this->mPos;
							$this->mPos++;
						}

						$this->mQueuedToken[] = $queueToken;
						$this->mPos++;
						break 2; // switch + while
					}
					break;
				case "\n": // for block levels, actually, only "----" is handled.
				case "\r": // headings are detected to close any unbalanced em or strong tags in a section
					if ( $this->continues( '----' ) ) {
						$queueToken = array( 'type' => '----', 'text' => '' );
						$this->mQueuedToken[] = $queueToken;
						// skip the line break, the four dashes and any dashes after them
						$this->mPos += 5;
						while ( $this->mPos < $this->mTextLength
							&& $this->mText[$this->mPos] == '-' )
						{
							$this->mPos++;
						}
						break 2; // switch + while
					} else if (
						$this->continues( '<h' ) && (
							$this->continues( '<h1' ) ||
							$this->continues( '<h2' ) ||
							$this->continues( '<h3' ) ||
							$this->continues( '<h4' ) ||
							$this->continues( '<h5' ) ||
							$this->continues( '<h6' )
						)
					) { // heading
						$queueToken = array( 'type' => 'h', 'text' => '' );
						$this->mQueuedToken[] = $queueToken;
						// only the line break is consumed, the tag stays in the text
						$this->mPos++;
						break 2; // switch + while
					}
					break;
				case '!': // French spacing rules have a space before exclamation
				case '?': // and question marks. Those have to become &nbsp;
				case ':': // And colons, Hashar says ...
					if ( $this->preceeded( ' ' ) ) {
						// strip blank from Token
						$token['text'] = substr( $token['text'], 0, -1 );
						$queueToken = array( 'type' => 'blank', 'text' => ' ' . $ch );
						$this->mQueuedToken[] = $queueToken;
						$this->mPos++;
						break 2; // switch + while
					}
					break;
				case '0': // A space between two numbers is used to ease reading
				case '1': // of big numbers, e.g. 1 000 000. Those spaces need
				case '2': // to be unbreakable
				case '3':
				case '4':
				case '5':
				case '6':
				case '7':
				case '8':
				case '9':
					// mPos + 2 is read below, so it has to be a valid offset.
					if ( ( $this->mTextLength > $this->mPos + 2 )
						&& ( $this->mText[$this->mPos + 1] == ' ' )
						&& ctype_digit( $this->mText[$this->mPos + 2] ) )
					{
						$queueToken = array( 'type' => 'blank', 'text' => $ch . ' ' );
						$this->mQueuedToken[] = $queueToken;
						$this->mPos += 2;
						break 2; // switch + while
					}
					break;
				case "\302": // first byte of UTF-8 Character Guillemet-left
					if ( $this->continues( "\253 " ) ) { // second byte and a blank
						$queueToken = array( 'type' => 'blank', 'text' => "\302\253 " );
						$this->mQueuedToken[] = $queueToken;
						$this->mPos += 3;
						break 2; // switch + while
					}
					break;
				case "\273": // last byte of UTF-8 Character Guillemet-right
					if ( $this->preceeded( " \302" ) ) {
						$queueToken = array( 'type' => 'blank', 'text' => " \302\273" );
						// the blank and the first byte were already collected
						// as text; take them out again
						$token['text'] = substr( $token['text'], 0, -2 );
						$this->mQueuedToken[] = $queueToken;
						$this->mPos++;
						break 2; // switch + while
					}
					break;
				case '&': // extensions like <timeline>, since HTML stripping has already been done,
					  // those look like &lt;timeline&gt;
					if ( $this->continues( 'lt;timeline&gt;' ) ) {
						$queueToken = array( 'type' => '<timeline>', 'text' => '&lt;timeline&gt;' );
						$this->mQueuedToken[] = $queueToken;
						$this->mPos += 16;
						break 2; // switch + while
					}
					break;

				} /* switch */
				// Nothing special at this position: the character is plain text.
				$token['text'] .= $ch;
				$this->mPos++;
			} /* while */
		} /* if (nothing left in queue) */

		wfProfileOut( $fname );
		return $token;
	}

	/**
	 * function continues
	 *
	 * checks whether the mText continues with $cont from mPos+1
	 *
	 * @param string $cont bytes expected directly behind the current position
	 * @return bool true if mText contains $cont starting at mPos+1
	 * @access private
	 */
	function continues( $cont ) {
		$len = strlen( $cont );
		if ( $len == 0 ) {
			// Nothing to compare; succeeds as long as mPos is not beyond the text.
			return $this->mTextLength >= $this->mPos;
		}
		// The last byte to compare is at offset mPos + $len. If the text is
		// not long enough to have a byte there, it cannot contain $cont.
		if ( $this->mPos + $len >= $this->mTextLength ) {
			return false;
		}
		return substr( $this->mText, $this->mPos + 1, $len ) === $cont;
	}

	/**
	 * function preceeded
	 *
	 * checks whether the mText is preceeded by $prec at position mPos
	 *
	 * @param string $prec bytes expected directly in front of the current position
	 * @return bool true if the strlen( $prec ) bytes that end just before mPos equal $prec
	 * @access private
	 */
	function preceeded( $prec ) {
		$len = strlen( $prec );
		// if $prec is longer than the text up to mPos, return false
		if ( $this->mPos < $len ) {
			return false;
		}
		return ( 0 == strcmp( $prec, substr( $this->mText, $this->mPos - $len, $len ) ) );
	}

	/**
	 * Read everything from the current position up to the next occurrence
	 * of $border and move the position behind that occurrence.
	 *
	 * @param string $border text that ends the part to be read
	 * @return string the text between mPos and $border, or '' if $border
	 *                does not occur from mPos on (mPos is left untouched then)
	 */
	function readAllUntil( $border ) {
		// With mPos beyond the end of the text there is nothing left to
		// search, and strpos() does not accept such an offset.
		if ( $this->mPos > $this->mTextLength ) {
			return '';
		}
		$n = strpos( $this->mText, $border, $this->mPos );
		if ( $n === false ) {
			return '';
		}
		$ret = substr( $this->mText, $this->mPos, $n - $this->mPos );
		// NOTE(review): the "+ 1" also skips the first byte that follows
		// $border. Kept as it is because callers may rely on it; confirm
		// whether that is intended.
		$this->mPos = $n + strlen( $border ) + 1;
		return $ret;
	}

}