Split out doBlockLevels() into its own class
[lhc/web/wiklou.git] / includes / parser / BlockLevelPass.php
1 <?php
2
3 /**
4 * This is the part of the wikitext parser which handles automatic paragraphs
5 * and conversion of start-of-line prefixes to HTML lists.
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License along
18 * with this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 * http://www.gnu.org/copyleft/gpl.html
21 *
22 * @file
23 * @ingroup Parser
24 */
class BlockLevelPass {
	/** @var bool True while the definition-list item currently open is a <dt> rather than a <dd> */
	private $mDTopen = false;

	/** @var bool True while the lines being scanned are inside an explicit <pre> element */
	private $mInPre = false;

	/** @var string Name of the paragraph-level element currently open ('p', 'pre') or '' if none */
	private $mLastSection = '';

	/** @var bool Whether the input text starts at the beginning of a line */
	private $linestart;

	/** @var string The text to process */
	private $text;

	# State constants for the definition list colon extraction
	const COLON_STATE_TEXT = 0;
	const COLON_STATE_TAG = 1;
	const COLON_STATE_TAGSTART = 2;
	const COLON_STATE_CLOSETAG = 3;
	const COLON_STATE_TAGSLASH = 4;
	const COLON_STATE_COMMENT = 5;
	const COLON_STATE_COMMENTDASH = 6;
	const COLON_STATE_COMMENTDASHDASH = 7;

	/**
	 * Make lists from lines starting with ':', '*', '#', etc.
	 *
	 * @param string $text
	 * @param bool $linestart Whether or not this is at the start of a line.
	 * @return string The lists rendered as HTML
	 */
	public static function doBlockLevels( $text, $linestart ) {
		$pass = new self( $text, $linestart );
		return $pass->execute();
	}

	/**
	 * Private: instances are only created through doBlockLevels().
	 *
	 * @param string $text
	 * @param bool $linestart
	 */
	private function __construct( $text, $linestart ) {
		$this->text = $text;
		$this->linestart = $linestart;
	}

	/**
	 * Build the closing tag for the currently open paragraph-level element
	 * (if any) and reset the paragraph/pre state.
	 *
	 * @return string Closing tag followed by a newline, or '' if nothing was open
	 */
	private function closeParagraph() {
		$result = '';
		if ( $this->mLastSection != '' ) {
			$result = '</' . $this->mLastSection . ">\n";
		}
		$this->mInPre = false;
		$this->mLastSection = '';
		return $result;
	}

	/**
	 * getCommon() returns the length of the longest common prefix
	 * of both arguments.
	 *
	 * @param string $st1
	 * @param string $st2
	 *
	 * @return int
	 */
	private function getCommon( $st1, $st2 ) {
		$shorter = min( strlen( $st1 ), strlen( $st2 ) );

		for ( $i = 0; $i < $shorter; ++$i ) {
			if ( $st1[$i] !== $st2[$i] ) {
				break;
			}
		}
		return $i;
	}

	/**
	 * Open the list element appropriate to the given prefix character,
	 * closing any open paragraph first.
	 *
	 * @param string $char One of '*', '#', ':' or ';'
	 *
	 * @return string
	 */
	private function openList( $char ) {
		$result = $this->closeParagraph();

		if ( '*' === $char ) {
			$result .= "<ul><li>";
		} elseif ( '#' === $char ) {
			$result .= "<ol><li>";
		} elseif ( ':' === $char ) {
			$result .= "<dl><dd>";
		} elseif ( ';' === $char ) {
			$result .= "<dl><dt>";
			$this->mDTopen = true;
		} else {
			# Append rather than assign: closeParagraph() has already reset the
			# paragraph state, so its closing tag must not be thrown away.
			$result .= '<!-- ERR 1 -->';
		}

		return $result;
	}

	/**
	 * Close the current list item and open the next one at the same level,
	 * for the list type given by the prefix character.
	 *
	 * For definition lists the closing tag depends on whether a <dt> is
	 * currently open, and the <dt> flag is updated for the item being opened.
	 *
	 * @param string $char One of '*', '#', ':' or ';'
	 *
	 * @return string
	 */
	private function nextItem( $char ) {
		if ( '*' === $char || '#' === $char ) {
			return "</li>\n<li>";
		} elseif ( ':' === $char || ';' === $char ) {
			$close = "</dd>\n";
			if ( $this->mDTopen ) {
				$close = "</dt>\n";
			}
			if ( ';' === $char ) {
				$this->mDTopen = true;
				return $close . '<dt>';
			} else {
				$this->mDTopen = false;
				return $close . '<dd>';
			}
		}
		return '<!-- ERR 2 -->';
	}

	/**
	 * Close the list element appropriate to the given prefix character.
	 *
	 * Only '*', '#' and ':' are handled; execute() passes characters from
	 * the stored prefix, in which ';' has already been replaced by ':'.
	 *
	 * @param string $char One of '*', '#' or ':'
	 *
	 * @return string
	 */
	private function closeList( $char ) {
		if ( '*' === $char ) {
			$text = "</li></ul>";
		} elseif ( '#' === $char ) {
			$text = "</li></ol>";
		} elseif ( ':' === $char ) {
			if ( $this->mDTopen ) {
				$this->mDTopen = false;
				$text = "</dt></dl>";
			} else {
				$text = "</dd></dl>";
			}
		} else {
			return '<!-- ERR 3 -->';
		}
		return $text;
	}

	/**
	 * Handle the "; term : definition" form of a definition-list line.
	 *
	 * If $line contains a ':' outside of any tag, $line is reduced to the
	 * definition part and the markup for the term plus the switch from
	 * <dt> to <dd> is returned. Otherwise $line is left untouched.
	 *
	 * @param string &$line Text following the list prefix
	 *
	 * @return string Markup to append to the output, or '' if no split happened
	 */
	private function splitDefinitionTerm( &$line ) {
		$term = $definition = '';
		if ( $this->findColonNoLinks( $line, $term, $definition ) !== false ) {
			$line = $definition;
			return $term . $this->nextItem( ':' );
		}
		return '';
	}

	/**
	 * Run the pass: walk the text line by line, emitting paragraph and <pre>
	 * wrappers and turning start-of-line prefixes into HTML lists.
	 *
	 * @return string
	 */
	private function execute() {
		$text = $this->text;
		# Parsing through the text line by line. The main thing
		# happening here is handling of block-level elements p, pre,
		# and making lists from lines starting with * # : etc.
		$textLines = StringUtils::explode( "\n", $text );

		$lastPrefix = $output = '';
		$this->mDTopen = $inBlockElem = false;
		$prefixLength = 0;
		$paragraphStack = false;
		$inBlockquote = false;

		foreach ( $textLines as $oLine ) {
			# Fix up $linestart
			if ( !$this->linestart ) {
				# The first chunk continues an existing line: pass it through as is.
				$output .= $oLine;
				$this->linestart = true;
				continue;
			}
			# * = ul
			# # = ol
			# ; = dt
			# : = dd

			$lastPrefixLength = strlen( $lastPrefix );
			$preCloseMatch = preg_match( '/<\\/pre/i', $oLine );
			$preOpenMatch = preg_match( '/<pre/i', $oLine );
			# If not in a <pre> element, scan for and figure out what prefixes are there.
			if ( !$this->mInPre ) {
				# Multiple prefixes may abut each other for nested lists.
				$prefixLength = strspn( $oLine, '*#:;' );
				$prefix = substr( $oLine, 0, $prefixLength );

				# ; and : are both from definition-lists, so they're equivalent
				# for the purposes of determining whether or not we need to open/close
				# elements.
				$prefix2 = str_replace( ';', ':', $prefix );
				$t = substr( $oLine, $prefixLength );
				$this->mInPre = (bool)$preOpenMatch;
			} else {
				# Don't interpret any other prefixes in preformatted text
				$prefixLength = 0;
				$prefix = $prefix2 = '';
				$t = $oLine;
			}

			# List generation
			if ( $prefixLength && $lastPrefix === $prefix2 ) {
				# Same as the last item, so no need to deal with nesting or opening stuff
				$output .= $this->nextItem( substr( $prefix, -1 ) );
				$paragraphStack = false;

				if ( substr( $prefix, -1 ) === ';' ) {
					# The one nasty exception: definition lists work like this:
					# ; title : definition text
					# So we check for : in the remainder text to split up the
					# title and definition, without b0rking links.
					$output .= $this->splitDefinitionTerm( $t );
				}
			} elseif ( $prefixLength || $lastPrefixLength ) {
				# We need to open or close prefixes, or both.

				# Either open or close a level...
				$commonPrefixLength = $this->getCommon( $prefix, $lastPrefix );
				$paragraphStack = false;

				# Close all the prefixes which aren't shared.
				while ( $commonPrefixLength < $lastPrefixLength ) {
					$output .= $this->closeList( $lastPrefix[$lastPrefixLength - 1] );
					--$lastPrefixLength;
				}

				# Continue the current prefix if appropriate.
				if ( $prefixLength <= $commonPrefixLength && $commonPrefixLength > 0 ) {
					$output .= $this->nextItem( $prefix[$commonPrefixLength - 1] );
				}

				# Open prefixes where appropriate.
				if ( $lastPrefix && $prefixLength > $commonPrefixLength ) {
					$output .= "\n";
				}
				while ( $prefixLength > $commonPrefixLength ) {
					$char = substr( $prefix, $commonPrefixLength, 1 );
					$output .= $this->openList( $char );

					if ( ';' === $char ) {
						$output .= $this->splitDefinitionTerm( $t );
					}
					++$commonPrefixLength;
				}
				if ( !$prefixLength && $lastPrefix ) {
					$output .= "\n";
				}
				$lastPrefix = $prefix2;
			}

			# If we have no prefixes, go to paragraph mode.
			if ( 0 == $prefixLength ) {
				# No prefix (not in list)--go to paragraph mode
				# XXX: use a stack for nestable elements like span, table and div
				$openmatch = preg_match(
					'/(?:<table|<h1|<h2|<h3|<h4|<h5|<h6|<pre|<tr|'
						. '<p|<ul|<ol|<dl|<li|<\\/tr|<\\/td|<\\/th)/iS',
					$t
				);
				$closematch = preg_match(
					'/(?:<\\/table|<\\/h1|<\\/h2|<\\/h3|<\\/h4|<\\/h5|<\\/h6|'
						. '<td|<th|<\\/?blockquote|<\\/?div|<hr|<\\/pre|<\\/p|<\\/mw:|'
						. Parser::MARKER_PREFIX
						. '-pre|<\\/li|<\\/ul|<\\/ol|<\\/dl|<\\/?center)/iS',
					$t
				);

				if ( $openmatch || $closematch ) {
					$paragraphStack = false;
					# @todo bug 5718: paragraph closed
					$output .= $this->closeParagraph();
					if ( $preOpenMatch && !$preCloseMatch ) {
						$this->mInPre = true;
					}
					# Walk every <blockquote> / </blockquote> on the line; the last
					# one seen decides whether we are inside a blockquote afterwards.
					$bqOffset = 0;
					while ( preg_match( '/<(\\/?)blockquote[\s>]/i', $t,
						$bqMatch, PREG_OFFSET_CAPTURE, $bqOffset )
					) {
						$inBlockquote = !$bqMatch[1][0]; // is this a close tag?
						$bqOffset = $bqMatch[0][1] + strlen( $bqMatch[0][0] );
					}
					$inBlockElem = !$closematch;
				} elseif ( !$inBlockElem && !$this->mInPre ) {
					if ( ' ' == substr( $t, 0, 1 )
						&& ( $this->mLastSection === 'pre' || trim( $t ) != '' )
						&& !$inBlockquote
					) {
						# pre
						if ( $this->mLastSection !== 'pre' ) {
							$paragraphStack = false;
							$output .= $this->closeParagraph() . '<pre>';
							$this->mLastSection = 'pre';
						}
						# Drop the leading space that marked the line as preformatted
						$t = substr( $t, 1 );
					} else {
						# paragraph
						if ( trim( $t ) === '' ) {
							if ( $paragraphStack ) {
								$output .= $paragraphStack . '<br />';
								$paragraphStack = false;
								$this->mLastSection = 'p';
							} else {
								# Defer the paragraph break until we know what follows
								if ( $this->mLastSection !== 'p' ) {
									$output .= $this->closeParagraph();
									$this->mLastSection = '';
									$paragraphStack = '<p>';
								} else {
									$paragraphStack = '</p><p>';
								}
							}
						} else {
							if ( $paragraphStack ) {
								$output .= $paragraphStack;
								$paragraphStack = false;
								$this->mLastSection = 'p';
							} elseif ( $this->mLastSection !== 'p' ) {
								$output .= $this->closeParagraph() . '<p>';
								$this->mLastSection = 'p';
							}
						}
					}
				}
			}
			# somewhere above we forget to get out of pre block (bug 785)
			if ( $preCloseMatch && $this->mInPre ) {
				$this->mInPre = false;
			}
			# While a paragraph break is pending the (blank) line is not emitted
			if ( $paragraphStack === false ) {
				$output .= $t;
				if ( $prefixLength === 0 ) {
					$output .= "\n";
				}
			}
		}
		# Close any lists still open after the last line
		while ( $prefixLength ) {
			$output .= $this->closeList( $prefix2[$prefixLength - 1] );
			--$prefixLength;
			if ( !$prefixLength ) {
				$output .= "\n";
			}
		}
		if ( $this->mLastSection != '' ) {
			$output .= '</' . $this->mLastSection . '>';
			$this->mLastSection = '';
		}

		return $output;
	}

	/**
	 * Split up a string on ':', ignoring any occurrences inside tags
	 * to prevent illegal overlapping.
	 *
	 * $before and $after are only written when a usable ':' is found.
	 *
	 * @param string $str The string to split
	 * @param string &$before Set to everything before the ':'
	 * @param string &$after Set to everything after the ':'
	 * @throws MWException If the state machine reaches an unknown state
	 * @return int|bool The position of the ':', or false if none found
	 */
	private function findColonNoLinks( $str, &$before, &$after ) {
		$pos = strpos( $str, ':' );
		if ( $pos === false ) {
			# Nothing to find!
			return false;
		}

		$lt = strpos( $str, '<' );
		if ( $lt === false || $lt > $pos ) {
			# Easy; no tag nesting to worry about
			$before = substr( $str, 0, $pos );
			$after = substr( $str, $pos + 1 );
			return $pos;
		}

		# Ugly state machine to walk through avoiding tags.
		# $stack counts the tags that have been opened and not yet closed.
		$state = self::COLON_STATE_TEXT;
		$stack = 0;
		$len = strlen( $str );
		for ( $i = 0; $i < $len; $i++ ) {
			$c = $str[$i];

			switch ( $state ) {
				# (Using the number is a performance hack for common cases)
				case 0: # self::COLON_STATE_TEXT:
					switch ( $c ) {
						case "<":
							# Could be either a <start> tag or an </end> tag
							$state = self::COLON_STATE_TAGSTART;
							break;
						case ":":
							if ( $stack == 0 ) {
								# We found it!
								$before = substr( $str, 0, $i );
								$after = substr( $str, $i + 1 );
								return $i;
							}
							# Embedded in a tag; don't break it.
							break;
						default:
							# Skip ahead looking for something interesting
							$colon = strpos( $str, ':', $i );
							if ( $colon === false ) {
								# Nothing else interesting
								return false;
							}
							$lt = strpos( $str, '<', $i );
							if ( $stack === 0 ) {
								if ( $lt === false || $colon < $lt ) {
									# We found it! Report where the colon is,
									# not where this scan started.
									$before = substr( $str, 0, $colon );
									$after = substr( $str, $colon + 1 );
									return $colon;
								}
							}
							if ( $lt === false ) {
								# Nothing else interesting to find; abort!
								# We're nested, but there's no close tags left. Abort!
								# Both switch statements count as levels for break,
								# so 3 is needed to actually leave the for loop.
								break 3;
							}
							# Skip ahead to next tag start
							$i = $lt;
							$state = self::COLON_STATE_TAGSTART;
					}
					break;
				case 1: # self::COLON_STATE_TAG:
					# In a <tag>
					switch ( $c ) {
						case ">":
							$stack++;
							$state = self::COLON_STATE_TEXT;
							break;
						case "/":
							# Slash may be followed by >?
							$state = self::COLON_STATE_TAGSLASH;
							break;
						default:
							# ignore
					}
					break;
				case 2: # self::COLON_STATE_TAGSTART:
					switch ( $c ) {
						case "/":
							$state = self::COLON_STATE_CLOSETAG;
							break;
						case "!":
							$state = self::COLON_STATE_COMMENT;
							break;
						case ">":
							# Illegal early close? This shouldn't happen D:
							$state = self::COLON_STATE_TEXT;
							break;
						default:
							$state = self::COLON_STATE_TAG;
					}
					break;
				case 3: # self::COLON_STATE_CLOSETAG:
					# In a </tag>
					if ( $c === ">" ) {
						$stack--;
						if ( $stack < 0 ) {
							wfDebug( __METHOD__ . ": Invalid input; too many close tags\n" );
							return false;
						}
						$state = self::COLON_STATE_TEXT;
					}
					break;
				case self::COLON_STATE_TAGSLASH:
					if ( $c === ">" ) {
						# Yes, a self-closed tag <blah/>
						$state = self::COLON_STATE_TEXT;
					} else {
						# Probably we're jumping the gun, and this is an attribute
						$state = self::COLON_STATE_TAG;
					}
					break;
				case 5: # self::COLON_STATE_COMMENT:
					if ( $c === "-" ) {
						$state = self::COLON_STATE_COMMENTDASH;
					}
					break;
				case self::COLON_STATE_COMMENTDASH:
					if ( $c === "-" ) {
						$state = self::COLON_STATE_COMMENTDASHDASH;
					} else {
						$state = self::COLON_STATE_COMMENT;
					}
					break;
				case self::COLON_STATE_COMMENTDASHDASH:
					if ( $c === ">" ) {
						$state = self::COLON_STATE_TEXT;
					} else {
						$state = self::COLON_STATE_COMMENT;
					}
					break;
				default:
					throw new MWException( "State machine error in " . __METHOD__ );
			}
		}
		if ( $stack > 0 ) {
			wfDebug( __METHOD__ . ": Invalid input; not enough close tags (stack $stack, state $state)\n" );
		}
		return false;
	}
}