DB error log
[lhc/web/wiklou.git] / includes / Tokenizer.php
1 <?php
/**
 * Splits wikitext into a stream of tokens.
 *
 * A token is an associative array with at least the keys 'type' and 'text'.
 * Plain runs of characters are returned as tokens of type 'text'; every piece
 * of markup recognised by nextToken() is returned as a token whose 'type'
 * names the markup (e.g. '[[', ']]', "'''", '----', 'h', 'blank', 'RFC ',
 * 'ISBN ', '<timeline>'). Quote tokens additionally carry a 'pos' key.
 *
 * The tokenizer works on bytes (strlen / string offsets), not on characters.
 */
class Tokenizer {
	/** Text to be processed by the tokenizer. Treated as private. */
	public $mText = '';

	/** Current byte position of the tokenizer in $mText. Treated as private. */
	public $mPos = 0;

	/** Length of $mText in bytes. Treated as private. */
	public $mTextLength = 0;

	/**
	 * Tokens that were already found, but not returned yet (FIFO).
	 * Treated as private.
	 */
	public $mQueuedToken = array();

	/**
	 * Result of $wgLang->linkPrefixExtension(); when truthy, non-blank
	 * characters directly in front of "[[" are moved into the link token.
	 */
	public $linkPrefixExtension = false;

	/**
	 * Set up an empty tokenizer. Use Tokenizer::newFromString() to get an
	 * instance that actually has text to work on.
	 *
	 * Reads the global $wgLang to find out whether link prefixes are in use.
	 */
	public function __construct()
	{
		global $wgLang;

		$this->mPos = 0;
		// The queue must start out as an array: previewToken() uses
		// array_unshift() on it and both accessors count() it.
		$this->mQueuedToken = array();
		$this->linkPrefixExtension = $wgLang->linkPrefixExtension();
	}

	/**
	 * Legacy (PHP 4 style) constructor name, kept so that existing explicit
	 * calls such as parent::Tokenizer() keep working. Declared after
	 * __construct() on purpose so that __construct() is the constructor.
	 */
	public function Tokenizer()
	{
		$this->__construct();
	}

	/**
	 * Factory function: create a tokenizer for the given text.
	 *
	 * @param string $s Text to tokenize
	 * @return Tokenizer
	 */
	public static function newFromString( $s )
	{
		$fname = 'Tokenizer::newFromString';
		wfProfileIn( $fname );

		// String offsets are used throughout, so make sure this is a string.
		$s = (string)$s;

		$t = new Tokenizer();
		$t->mText = $s;
		$t->mTextLength = strlen( $s );

		wfProfileOut( $fname );
		return $t;
	}

	/**
	 * Return the next token without consuming it: the next call to
	 * previewToken() or nextToken() returns the same token again.
	 *
	 * Internally the read position does advance; the token is put at the
	 * front of the queue, from where the next call picks it up.
	 *
	 * @return array|false Token array, or false if no text is left
	 */
	public function previewToken()
	{
		$fname = 'Tokenizer::previewToken';
		wfProfileIn( $fname );

		if ( count( $this->mQueuedToken ) != 0 ) {
			// Still a token left from an earlier round. Show that one.
			$token = $this->mQueuedToken[0];
		} else {
			$token = $this->nextToken();
			// nextToken() may itself have queued a markup token that follows
			// $token, so $token has to go in front of it.
			array_unshift( $this->mQueuedToken, $token );
		}

		wfProfileOut( $fname );
		return $token;
	}

	/**
	 * Get the next token.
	 *
	 * Proceeds byte by byte through the text, collecting bytes into a 'text'
	 * token until a byte sequence needing special attention is found. The
	 * markup token for that sequence is queued and the text collected so far
	 * (possibly empty) is returned; the queued token is returned by the next
	 * call.
	 *
	 * Sequences currently recognised: "RFC ", "ISBN ", "[[", "[[[", "]]",
	 * runs of two or more apostrophes, a line break followed by "----" or by
	 * "<h1".."<h6", a blank in front of "!", "?" or ":", a blank between two
	 * digits, a blank after a left guillemet or before a right guillemet
	 * (UTF-8 encoded), and "&lt;timeline&gt;".
	 *
	 * @return array|false Token array, or false if no text is left
	 */
	public function nextToken()
	{
		$fname = 'Tokenizer::nextToken';
		wfProfileIn( $fname );

		if ( count( $this->mQueuedToken ) != 0 ) {
			// Still a token left from an earlier round. Return that one first.
			$token = array_shift( $this->mQueuedToken );
		} elseif ( $this->mPos >= $this->mTextLength ) {
			// If no text is left, return "false".
			$token = false;
		} else {
			$token = array( 'text' => '', 'type' => 'text' );

			while ( $this->mPos < $this->mTextLength ) {
				$ch = $this->mText[$this->mPos];
				switch ( $ch ) {
					case 'R': // for "RFC "
						if ( $this->continues( 'FC ' ) ) {
							// Only "RFC" is consumed; the blank stays in the
							// input and also starts the next text token.
							// NOTE(review): presumably what callers expect - confirm.
							$this->enqueue( 'RFC ', 'RFC ', 3 );
							break 2; // switch + while
						}
						break;

					case 'I': // for "ISBN "
						if ( $this->continues( 'SBN ' ) ) {
							// As for "RFC ": the trailing blank is not consumed.
							$this->enqueue( 'ISBN ', 'ISBN ', 4 );
							break 2; // switch + while
						}
						break;

					case '[': // for links "[[" and "[[["
						if ( $this->continues( '[[' ) ) {
							$this->enqueue( '[[[', '', 3 );
							break 2; // switch + while
						}
						if ( $this->continues( '[' ) ) {
							$prefix = '';
							// Check for a "prefixed link", e.g. Al[[Khazar]]
							// Mostly for arabic wikipedia
							if ( $this->linkPrefixExtension ) {
								// Move the trailing run of non-blank bytes
								// from the text token into the link's open tag.
								$cut = strlen( $token['text'] );
								while ( $cut > 0 && !ctype_space( $token['text'][$cut - 1] ) ) {
									$cut--;
								}
								$prefix = (string)substr( $token['text'], $cut );
								$token['text'] = (string)substr( $token['text'], 0, $cut );
							}
							$this->enqueue( '[[', $prefix, 2 );
							break 2; // switch + while
						}
						break;

					case ']': // for end of links "]]"
						if ( $this->continues( ']' ) ) {
							$this->enqueue( ']]', '', 2 );
							break 2; // switch + while
						}
						break;

					case "'": // for all kind of em's and strong's
						if ( $this->continues( "'" ) ) {
							// Length of the whole run of apostrophes (>= 2).
							$run = strspn( $this->mText, "'", $this->mPos );
							$this->mQueuedToken[] = array(
								'type' => str_repeat( "'", $run ),
								'text' => '',
								// Offset of the second-to-last apostrophe of
								// the run, as the original loop recorded it.
								'pos' => $this->mPos + $run - 2,
							);
							$this->mPos += $run;
							break 2; // switch + while
						}
						break;

					case "\n": // for block levels, actually, only "----" is handled.
					case "\r": // headings are detected to close any unbalanced em or strong tags in a section
						if ( $this->continues( '----' ) ) {
							// Consume the line break and the four dashes ...
							$this->enqueue( '----', '', 5 );
							// ... and any further dashes directly behind them.
							$this->mPos += strspn( $this->mText, '-', $this->mPos );
							break 2; // switch + while
						}
						if ( $this->continues( '<h' ) && $this->continuesWithHeading() ) {
							// Only the line break is consumed (it is not added
							// to the text token); the tag itself is left in
							// the input.
							$this->enqueue( 'h', '', 1 );
							break 2; // switch + while
						}
						break;

					case '!': // French spacing rules have a space before exclamation
					case '?': // and question marks. Those have to become &nbsp;
					case ':': // And colons, Hashar says ...
						if ( $this->preceeded( ' ' ) ) {
							// strip blank from Token
							$token['text'] = (string)substr( $token['text'], 0, -1 );
							$this->enqueue( 'blank', " {$ch}", 1 );
							break 2; // switch + while
						}
						break;

					case '0': // A space between two numbers is used to ease reading
					case '1': // of big numbers, e.g. 1 000 000. Those spaces need
					case '2': // to be unbreakable
					case '3':
					case '4':
					case '5':
					case '6':
					case '7':
					case '8':
					case '9':
						// Both look-ahead bytes have to exist, hence "+ 2 <".
						if ( $this->mPos + 2 < $this->mTextLength
							&& $this->mText[$this->mPos + 1] === ' '
							&& ctype_digit( $this->mText[$this->mPos + 2] ) )
						{
							$this->enqueue( 'blank', $ch . ' ', 2 );
							break 2; // switch + while
						}
						break;

					case "\302": // first byte of UTF-8 Character Guillemet-left
						if ( $this->continues( "\253 " ) ) { // second byte and a blank
							$this->enqueue( 'blank', "\302\253 ", 3 );
							break 2; // switch + while
						}
						break;

					case "\273": // last byte of UTF-8 Character Guillemet-right
						if ( $this->preceeded( " \302" ) ) {
							// strip the blank and the first guillemet byte from Token
							$token['text'] = (string)substr( $token['text'], 0, -2 );
							$this->enqueue( 'blank', " \302\273", 1 );
							break 2; // switch + while
						}
						break;

					case '&': // extensions like <timeline>, since HTML stripping has already been done,
						// those look like &lt;timeline&gt;
						if ( $this->continues( 'lt;timeline&gt;' ) ) {
							$this->enqueue( '<timeline>', '&lt;timeline&gt;', 16 );
							break 2; // switch + while
						}
						break;
				} /* switch */

				// Nothing special: the byte is part of the running text.
				$token['text'] .= $ch;
				$this->mPos++;
			} /* while */
		} /* if (nothing left in queue) */

		wfProfileOut( $fname );
		return $token;
	}

	/**
	 * Append a markup token to the queue and move the read position forward.
	 *
	 * @param string $type Token type
	 * @param string $text Token text
	 * @param int $advance Number of bytes to consume
	 */
	private function enqueue( $type, $text, $advance )
	{
		$this->mQueuedToken[] = array( 'type' => $type, 'text' => $text );
		$this->mPos += $advance;
	}

	/**
	 * Check whether the text continues with "<h1" .. "<h6" from mPos+1.
	 *
	 * @return bool
	 */
	private function continuesWithHeading()
	{
		for ( $level = 1; $level <= 6; $level++ ) {
			if ( $this->continues( '<h' . $level ) ) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Check whether mText continues with $cont from mPos+1, i.e. directly
	 * behind the current byte. Treated as private.
	 *
	 * @param string $cont Expected continuation
	 * @return bool
	 */
	public function continues( $cont )
	{
		$len = strlen( $cont );
		// The last byte compared is at mPos + $len and has to exist.
		if ( $this->mTextLength <= $this->mPos + $len ) {
			return false;
		}
		return (string)substr( $this->mText, $this->mPos + 1, $len ) === $cont;
	}

	/**
	 * Check whether the byte at mPos is directly preceded by $prec in mText.
	 * Treated as private.
	 *
	 * @param string $prec Expected preceding bytes
	 * @return bool
	 */
	public function preceeded( $prec )
	{
		$len = strlen( $prec );
		// if $prec is longer than the text up to mPos, return false
		if ( $this->mPos < $len ) {
			return false;
		}
		return (string)substr( $this->mText, $this->mPos - $len, $len ) === $prec;
	}

	/**
	 * Return everything from the current position up to (not including) the
	 * next occurrence of $border and move the position behind it.
	 *
	 * If $border does not occur, '' is returned and the position is left
	 * unchanged. Queued tokens are not taken into account.
	 *
	 * @param string $border Delimiter to read up to
	 * @return string
	 */
	public function readAllUntil( $border )
	{
		// strpos() rejects an empty needle and an offset behind the end of
		// the string; neither can produce a match.
		if ( $border === '' || $this->mPos > $this->mTextLength ) {
			return '';
		}
		$n = strpos( $this->mText, $border, $this->mPos );
		if ( $n === false ) {
			return '';
		}
		$ret = substr( $this->mText, $this->mPos, $n - $this->mPos );
		// NOTE(review): the "+ 1" also skips the byte directly behind $border.
		// This looks like an off-by-one, but is kept because callers may
		// depend on it - confirm before changing.
		$this->mPos = $n + strlen( $border ) + 1;
		return $ret;
	}

}