From 70105f8b85195a4a9febbe410ccc6158aa7e03fb Mon Sep 17 00:00:00 2001 From: Tim Starling Date: Thu, 15 Sep 2011 12:10:53 +0000 Subject: [PATCH] Reverted r85922 and related: new doTableStuff(). I copied in the old doTableStuff() from before r85922 and reverted all parser test changes that looked vaguely related. Apologies to Platonides, since some of his parser tests appeared to be relevant to the old parser, but it's simplest to just revert all the related changes and then re-add any useful tests later. See CR r85922 for full rationale. --- CREDITS | 1 - includes/Sanitizer.php | 2 +- includes/parser/Parser.php | 389 +++++++------------ tests/parser/parserTests.txt | 720 +++++------------------------------ 4 files changed, 226 insertions(+), 886 deletions(-) diff --git a/CREDITS b/CREDITS index 5c64336fff..a792447065 100644 --- a/CREDITS +++ b/CREDITS @@ -83,7 +83,6 @@ following names for their contribution to the product. * Azliq7 * Beau * Bergi -* Bluehairedlawyer * Borislav Manolov * Brad Jorsch * Brent G diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php index 5080582621..227fa7c0cb 100644 --- a/includes/Sanitizer.php +++ b/includes/Sanitizer.php @@ -379,7 +379,7 @@ class Sanitizer { 'strike', 'strong', 'tt', 'var', 'div', 'center', 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'abbr', 'dfn', - 'kbd', 'samp', 'thead', 'tbody', 'tfoot' + 'kbd', 'samp' ); $htmlsingle = array( 'br', 'hr', 'li', 'dt', 'dd' diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php index 744f2e62ef..0d3e2a2b80 100644 --- a/includes/parser/Parser.php +++ b/includes/parser/Parser.php @@ -841,318 +841,195 @@ class Parser { * parse the wiki syntax used to render tables * * @private - * - * @param $text string - * - * @return string */ function doTableStuff( $text ) { wfProfileIn( __METHOD__ ); $lines = StringUtils::explode( "\n", $text ); $out = ''; - $output =& $out; + $td_history = array(); # Is currently a td tag open? + $last_tag_history = array(); # Save history of last lag activated (td, th or caption) + $tr_history = array(); # Is currently a tr tag open? + $tr_attributes = array(); # history of tr attributes + $has_opened_tr = array(); # Did this table open a element? + $indent_level = 0; # indent level of the table foreach ( $lines as $outLine ) { $line = trim( $outLine ); - # empty line, go to next line, - # but only append \n if outside of table - if ( $line === '' ) { - $output .= $outLine; - if ( !isset( $tables[0] ) ) { - $output .= "\n"; - } + if ( $line === '' ) { # empty line, go to next line + $out .= $outLine."\n"; continue; } - $firstChars = $line[0]; - if ( strlen( $line ) > 1 ) { - $firstChars .= in_array( $line[1], array( '}', '+', '-' ) ) ? $line[1] : ''; - } + + $first_character = $line[0]; $matches = array(); - if ( preg_match( '/^(:*)\s*\{\|(.*)$/', $line , $matches ) ) { - $tables[] = array(); - $table =& $this->last( $tables ); - $table[0] = array(); // first row - $currentRow =& $table[0]; - $table['indent'] = strlen( $matches[1] ); + if ( preg_match( '/^(:*)\{\|(.*)$/', $line , $matches ) ) { + # First check if we are starting a new table + $indent_level = strlen( $matches[1] ); $attributes = $this->mStripState->unstripBoth( $matches[2] ); $attributes = Sanitizer::fixTagAttributes( $attributes , 'table' ); - if ( $attributes !== '' ) { - $table['attributes'] = $attributes; - } - } elseif ( !isset( $tables[0] ) ) { - // we're outside the table - - $out .= $outLine . "\n"; - } elseif ( $firstChars === '|}' ) { - // trim the |} code from the line - $line = substr ( $line , 2 ); - - // Shorthand for last row - $lastRow =& $this->last( $table ); - - // a thead at the end becomes a tfoot, unless there is only one row - // Do this before deleting empty last lines to allow headers at the bottom of tables - if ( isset( $lastRow['type'] ) && $lastRow['type'] == 'thead' && isset( $table[1] ) ) { - $lastRow['type'] = 'tfoot'; - for ( $i = 0; isset( $lastRow[$i] ); $i++ ) { - $lastRow[$i]['type'] = 'th'; - } - } + $outLine = str_repeat( '
' , $indent_level ) . ""; + array_push( $td_history , false ); + array_push( $last_tag_history , '' ); + array_push( $tr_history , false ); + array_push( $tr_attributes , '' ); + array_push( $has_opened_tr , false ); + } elseif ( count( $td_history ) == 0 ) { + # Don't do any of the following + $out .= $outLine."\n"; + continue; + } elseif ( substr( $line , 0 , 2 ) === '|}' ) { + # We are ending a table + $line = '' . substr( $line , 2 ); + $last_tag = array_pop( $last_tag_history ); - // Delete empty last lines - if ( empty( $lastRow ) ) { - $lastRow = NULL; + if ( !array_pop( $has_opened_tr ) ) { + $line = "{$line}"; } - $o = ''; - $curtable = array_pop( $tables ); - #Add a line-ending before the table, but only if there isn't one already - if ( substr( $out, -1 ) !== "\n" ) { - $o .= "\n"; + if ( array_pop( $tr_history ) ) { + $line = "{$line}"; } - $o .= $this->generateTableHTML( $curtable ) . $line . "\n"; - if ( count( $tables ) > 0 ) { - $table =& $this->last( $tables ); - $currentRow =& $this->last( $table ); - $currentElement =& $this->last( $currentRow ); - - $output =& $currentElement['content']; - } else { - $output =& $out; + if ( array_pop( $td_history ) ) { + $line = "{$line}"; } + array_pop( $tr_attributes ); + $outLine = $line . str_repeat( '
' , $indent_level ); + } elseif ( substr( $line , 0 , 2 ) === '|-' ) { + # Now we have a table row + $line = preg_replace( '#^\|-+#', '', $line ); - $output .= $o; - - } elseif ( $firstChars === '|-' ) { - // start a new row element - // but only when we haven't started one already - if ( count( $currentRow ) != 0 ) { - $table[] = array(); - $currentRow =& $this->last( $table ); - } - // Get the attributes, there's nothing else useful in $line now - $line = substr ( $line , 2 ); + # Whats after the tag is now only attributes $attributes = $this->mStripState->unstripBoth( $line ); $attributes = Sanitizer::fixTagAttributes( $attributes, 'tr' ); - if ( $attributes !== '' ) { - $currentRow['attributes'] = $attributes; - } + array_pop( $tr_attributes ); + array_push( $tr_attributes, $attributes ); - } elseif ( $firstChars === '|+' ) { - // a table caption, but only proceed if there isn't one already - if ( !isset ( $table['caption'] ) ) { - $line = substr ( $line , 2 ); - - $c = $this->getCellAttr( $line , 'caption' ); - $table['caption'] = array(); - $table['caption']['content'] = $c[0]; - if ( isset( $c[1] ) ) $table['caption']['attributes'] = $c[1]; - unset( $c ); - $output =& $table['caption']['content']; - } - } elseif ( $firstChars === '|' || $firstChars === '!' || $firstChars === '!+' ) { - // Which kind of cells are we dealing with - $currentTag = 'td'; - $line = substr ( $line , 1 ); - - if ( $firstChars === '!' || $firstChars === '!+' ) { - $line = str_replace ( '!!' , '||' , $line ); - $currentTag = 'th'; - } + $line = ''; + $last_tag = array_pop( $last_tag_history ); + array_pop( $has_opened_tr ); + array_push( $has_opened_tr , true ); - // Split up multiple cells on the same line. - $cells = StringUtils::explodeMarkup( '||' , $line ); - $line = ''; // save memory + if ( array_pop( $tr_history ) ) { + $line = ''; + } - // decide whether thead to tbody - if ( !array_key_exists( 'type', $currentRow ) ) { - $currentRow['type'] = ( $firstChars === '!' ) ? 'thead' : 'tbody' ; - } elseif ( $firstChars === '|' ) { - $currentRow['type'] = 'tbody'; + if ( array_pop( $td_history ) ) { + $line = "{$line}"; } - // Loop through each table cell - foreach ( $cells as $cell ) { - // a new cell - $currentRow[] = array(); - $currentElement =& $this->last( $currentRow ); + $outLine = $line; + array_push( $tr_history , false ); + array_push( $td_history , false ); + array_push( $last_tag_history , '' ); + } elseif ( $first_character === '|' || $first_character === '!' || substr( $line , 0 , 2 ) === '|+' ) { + # This might be cell elements, td, th or captions + if ( substr( $line , 0 , 2 ) === '|+' ) { + $first_character = '+'; + $line = substr( $line , 1 ); + } - $currentElement['type'] = $currentTag; + $line = substr( $line , 1 ); - $c = $this->getCellAttr( $cell , $currentTag ); - $currentElement['content'] = $c[0]; - if ( isset( $c[1] ) ) $currentElement['attributes'] = $c[1]; - unset( $c ); + if ( $first_character === '!' ) { + $line = str_replace( '!!' , '||' , $line ); } - $output =& $currentElement['content']; - } else { - $output .= "\n$outLine"; - } - } + # Split up multiple cells on the same line. + # FIXME : This can result in improper nesting of tags processed + # by earlier parser steps, but should avoid splitting up eg + # attribute values containing literal "||". + $cells = StringUtils::explodeMarkup( '||' , $line ); - # Remove trailing line-ending (b/c) - if ( substr( $out, -1 ) === "\n" ) { - $out = substr( $out, 0, -1 ); - } + $outLine = ''; - # Close any unclosed tables - if ( isset( $tables ) && count( $tables ) > 0 ) { - for ( $i = 0; $i < count( $tables ); $i++ ) { - $curtable = array_pop( $tables ); - $curtable = $this->generateTableHTML( $curtable ); - #Add a line-ending before the table, but only if there isn't one already - if ( substr( $out, -1 ) !== "\n" && $curtable !== "" ) { - $out .= "\n"; - } - $out .= $curtable; - } - } + # Loop through each table cell + foreach ( $cells as $cell ) { + $previous = ''; + if ( $first_character !== '+' ) { + $tr_after = array_pop( $tr_attributes ); + if ( !array_pop( $tr_history ) ) { + $previous = "\n"; + } + array_push( $tr_history , true ); + array_push( $tr_attributes , '' ); + array_pop( $has_opened_tr ); + array_push( $has_opened_tr , true ); + } - wfProfileOut( __METHOD__ ); + $last_tag = array_pop( $last_tag_history ); - return $out; - } + if ( array_pop( $td_history ) ) { + $previous = "\n{$previous}"; + } - /** - * Helper function for doTableStuff() separating the contents of cells from - * attributes. Particularly useful as there's a possible bug and this action - * is repeated twice. - * - * @private - * @param $cell - * @param $tagName - * @return array - */ - function getCellAttr ( $cell, $tagName ) { - $attributes = null; + if ( $first_character === '|' ) { + $last_tag = 'td'; + } elseif ( $first_character === '!' ) { + $last_tag = 'th'; + } elseif ( $first_character === '+' ) { + $last_tag = 'caption'; + } else { + $last_tag = ''; + } - $cell = trim ( $cell ); + array_push( $last_tag_history , $last_tag ); - // A cell could contain both parameters and data - $cellData = explode ( '|' , $cell , 2 ); + # A cell could contain both parameters and data + $cell_data = explode( '|' , $cell , 2 ); - // Bug 553: Note that a '|' inside an invalid link should not - // be mistaken as delimiting cell parameters - if ( strpos( $cellData[0], '[[' ) !== false ) { - $content = trim ( $cell ); - } - elseif ( count ( $cellData ) == 1 ) { - $content = trim ( $cellData[0] ); - } else { - $attributes = $this->mStripState->unstripBoth( $cellData[0] ); - $attributes = Sanitizer::fixTagAttributes( $attributes , $tagName ); + # Bug 553: Note that a '|' inside an invalid link should not + # be mistaken as delimiting cell parameters + if ( strpos( $cell_data[0], '[[' ) !== false ) { + $cell = "{$previous}<{$last_tag}>{$cell}"; + } elseif ( count( $cell_data ) == 1 ) { + $cell = "{$previous}<{$last_tag}>{$cell_data[0]}"; + } else { + $attributes = $this->mStripState->unstripBoth( $cell_data[0] ); + $attributes = Sanitizer::fixTagAttributes( $attributes , $last_tag ); + $cell = "{$previous}<{$last_tag}{$attributes}>{$cell_data[1]}"; + } - $content = trim ( $cellData[1] ); + $outLine .= $cell; + array_push( $td_history , true ); + } + } + $out .= $outLine . "\n"; } - return array( $content, $attributes ); - } - - /** - * Helper function for doTableStuff(). This converts the structured array into html. - * - * @private - * - * @param $table array - * - * @return string - */ - function generateTableHTML( &$table ) { - $return = str_repeat( '
' , $table['indent'] ); - $return .= ' 0 ) { + if ( array_pop( $td_history ) ) { + $out .= "\n"; } - if ( !$lastSection ) { - $lastSection = $table[$i]['type']; - } elseif ( $lastSection != $table[$i]['type'] ) { - $simple = false; + if ( array_pop( $tr_history ) ) { + $out .= "\n"; } - } - $lastSection = ''; - for ( $i = 0; isset( $table[$i] ); $i++ ) { - if ( !count( $table[$i] ) ) continue; - $empty = false; // check for empty tables - - if ( $table[$i]['type'] != $lastSection && !$simple ) { - $return .= "\n<" . $table[$i]['type'] . '>'; + if ( !array_pop( $has_opened_tr ) ) { + $out .= "\n" ; } - $return .= "\n'; - unset( $table[$i][$j] ); - } - $return .= "\n"; + $out .= "\n"; + } - if ( ( !isset( $table[$i + 1] ) && !$simple ) || ( isset( $table[$i + 1] ) && isset( $table[$i + 1]['type'] ) && $table[$i]['type'] != $table[$i + 1]['type'] ) ) { - $return .= ''; - } - $lastSection = $table[$i]['type']; - unset( $table[$i] ); + # Remove trailing line-ending (b/c) + if ( substr( $out, -1 ) === "\n" ) { + $out = substr( $out, 0, -1 ); } - if ( $empty ) { - if ( isset( $table['caption'] ) ) { - $return .= "\n"; - } else { - return ''; - } + + # special case: don't return empty table + if ( $out === "\n\n
" ) { + $out = ''; } - $return .= "\n"; - $return .= str_repeat( '
' , $table['indent'] ); - return $return; - } + wfProfileOut( __METHOD__ ); - /** - * like end() but only works on the numeric array index and php's internal pointers - * returns a reference to the last element of an array much like "\$arr[-1]" in perl - * ignores associative elements and will create a 0 key will a NULL value if there were - * no numric elements and an array itself if not previously defined. - * - * @private - * - * @param $arr array - */ - function &last ( &$arr ) { - for ( $i = count( $arr ); ( !isset( $arr[$i] ) && $i > 0 ); $i-- ) { } - return $arr[$i]; + return $out; } /** diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt index 5781e585f8..8ed3f2a2bc 100644 --- a/tests/parser/parserTests.txt +++ b/tests/parser/parserTests.txt @@ -1246,10 +1246,8 @@ A table with nothing but a caption |} !! result - - -
caption -
+ caption + !! end @@ -1264,338 +1262,13 @@ Simple table !! result - - - - - - - -
1 -2 -
3 -4 -
- -!! end - -!! test -Table inside unclosed table w/o cells -!! input -{| -{| -| foo bar -|} - -!! result - - - - -
- - - - -
foo bar -
-
- -!! end - -!! test -Table with thead -!! input -{| -! Number !! Another number -|- -| 1 || 2 -|- -| 3 || 4 -|} -!! result - - - - - - - - - - - - - - - -
Number -Another number -
1 -2 -
3 -4 -
- -!! end - -!! test -Table with multiple captions: Only keep first -!! input -{| -|+ caption 1 -|+ caption 2 -|} -!! result - - - -
caption 1 -
- -!! end - -!! test -Table with multiline caption -!! input -{| -|+ caption 1 -further caption -|} -!! result - - - -
caption 1 -further caption -
- -!! end -!! test -Table with multiple thead -!! input -{| -! Number !! Another number -|- -| 1 || 2 -|- -! Some other number !! Another number -|- -| 3 || 4 -|} -!! result - - - - - - - - - - - - - - - - - - - - - -
Number -Another number -
1 -2 -
Some other number -Another number -
3 -4 -
- -!! end -!! test -Table with thead & tfoot -!! input -{| -! Number !! Another number -|- -| 1 || 2 -|- -! Some other number !! Another number -|- -| 3 || 4 -|- -! Total: 4 !! Total: 6 -|} -!! result - - - - - - - - - - - - - - - - - - - - - - - - - - -
Number -Another number -
1 -2 -
Some other number -Another number -
3 -4 -
Total: 4 -Total: 6 -
- -!! end - -!! test -Table have th inside tfoot -!! input -{| -| cell1 || cell2 -|- -! Footer1 !! Footer2 -|} -!! result - - - - - - - - - - - -
cell1 -cell2 -
Footer1 -Footer2 -
- -!! end - -!! test -Table have th inside thead -!! input -{| -! Header1 !! Header2 -|- -| cell1 || cell2 -|} -!! result - - - - - - - + + - - - -
Header1 -Header2 -
1 2 +
cell1 -cell2 -
- -!! end - -!! test -Table with list inside -!! input -{| -|style="width: 5em; text-align: center"| gives -|style="border: 1px dashed #2F6FAB; padding: 0.5em; margin: 0.5em"| -# Some -# list -# Lorem -# ipsum -# dolor -|} -!! result - - - - - -
gives - -
  1. Some -
  2. list -
  3. Lorem -
  4. ipsum -
  5. dolor -
-
- -!! end -!! test -Indented table wrapped in html tags (Related to Bug 26362) -!! input -
-:{| -|- -| test -|}
- -!! result -
-
- - - -
test -
- -!! end - -!! test -Table with multiline contents -!! input -{| -| Alice -Bob -dfdfg -dfg -|} -!! result - - - - -
Alice -

Bob -dfdfg -dfg -

-
+ 3 + 4 + !! end @@ -1626,69 +1299,47 @@ Multiplication table - - - - - - - + + + + - - - - - + + + - - - - - + + + - - - - - + + + - - - - - + + + - - - - - -
Multiplication table
× -1 -2 -3 -
× 1 2 3 +
1 + 1 1 -2 -3 -
1 2 3 +
2 + 2 2 -4 -6 -
2 4 6 +
3 + 3 3 -6 -9 -
3 6 9 +
4 + 4 4 -8 -12 -
4 8 12 +
5 + 5 5 -10 -15 -
+ 5 + 10 + 15 + !! end @@ -1706,20 +1357,17 @@ Table rowspan !! result - - - - + - - - -
Cell 1, row 1 + Cell 1, row 1 Cell 2, row 1 (and 2) + Cell 2, row 1 (and 2) Cell 3, row 1 -
Cell 3, row 1 +
Cell 1, row 2 -Cell 3, row 2 + Cell 1, row 2
+ Cell 3, row 2 + !! end @@ -1739,24 +1387,19 @@ Nested table !! result -
α + α - + - -
nested -
table -
+
the original table again - - - + !! end @@ -1770,85 +1413,10 @@ Invalid attributes in table cell (bug 1830) - -
broken -
- -!! end - -!! test -Heading inside table (affected by r85922) -!! input -{| -|- valign="top" -| -=== Heading === -|} -!! result - - - - -
-

[edit] Heading

-
- -!! end - -!! test -A table with a caption with unclosed italic -!! input -{| -|+ ''caption -| Cell -|} -!! result - - - - - -
caption -
Cell -
- -!! end - -!! test -A table with unclosed italic in a cell -!! input -{| -| ''Cell -|} -!! result - - - - -
Cell -
+ !! end -!! test -A table with unclosed italic in a th -!! input -{| -|- -! ''Cell -|| Value -|} -!! result - - - - - -
Cell -Value -
- -!! end !! test Table security: embedded pipes (http://lists.wikimedia.org/mailman/htdig/wikitech-l/2006-April/022293.html) @@ -1858,8 +1426,7 @@ Table security: embedded pipes (http://lists.wikimedia.org/mailman/htdig/wikitec !! result - + @@ -1867,70 +1434,6 @@ Table security: embedded pipes (http://lists.wikimedia.org/mailman/htdig/wikitec !! end -!! test -Indented Tables, bug 20078 -!! input -: {| -| 1 || 2 -|- -| 3 || 4 -|} -!! result -
[ftp://%7Cx -[ftp://%7Cx ]" onmouseover="alert(document.cookie)">test
- - - - - - - - -
1 -2 -
3 -4 -
- -!! end - -!! test -Arbitrary whitespace should not be prepended -!! input -{| -| 1 || 2 - -|- - - -| 3 || 4 -|- - -| 6 || 8 -|} -!! result - - - - - - - - - - - - - -
1 -2 -
3 -4 -
6 -8 -
- -!! end - ### ### Internal links @@ -3220,9 +2723,7 @@ BUG 553: link with two variables in a piped link - -
[[{{{1}}}|{{{2}}}]] -
+ !! end @@ -3331,18 +2832,13 @@ foo {{table}}

- - - + + - - - -
1 -2 -
1 2 +
3 -4 -
+ 3 + 4 + !! end @@ -3356,18 +2852,13 @@ foo

- - - + + - - - -
1 -2 -
1 2 +
3 -4 -
+ 3 + 4 + !! end @@ -5050,10 +4541,8 @@ Table multiple attributes correction !! result - - -
status -
+ status + !!end @@ -5497,10 +4986,8 @@ Table attribute legitimate extension !! result - - -
status -
+ status + !!end @@ -5513,10 +5000,8 @@ Table attribute safety !! result - - -
status -
+ status + !! end @@ -6138,7 +5623,8 @@ Fuzz testing: Parser13 !! result - +
+
@@ -6164,14 +5650,10 @@ Fuzz testing: Parser14-table !! input ==a== {| STYLE=__TOC__ -|foo !! result

[edit] a

- - - +
foo -
!! end @@ -6187,11 +5669,11 @@ noxml !! result - - + +
https:// -https:// +
@@ -6206,9 +5688,10 @@ Fuzz testing: Parser21 !! result - - +
irc://{{ftp://a" onmouseover="alert('hello world');" + irc://{{ftp://a" onmouseover="alert('hello world');" +
@@ -6220,29 +5703,15 @@ Fuzz testing: Parser22 http://===r:::https://b {| - !!result

http://===r:::https://b

-!! end + + +
-# Known to produce bad XML for now +!! end -# Note: the current result listed for this is not what the original one was, -# but the original bug was JavaScript injection, which is fixed in any case. -# It's not clear that the original result listed was any more correct than the -# current one. Original result: -# -# {{{| -# }}}} > -#
-# -# MOVE YOUR MOUSE CURSOR OVER THIS TEXT -#
-# -# -#
-#
# Known to produce bad XML for now !! test Fuzz testing: Parser24 @@ -6258,12 +5727,12 @@ noxml MOVE YOUR MOUSE CURSOR OVER THIS TEXT | !! result -

{{{| + +{{{| }}}} >
+ MOVE YOUR MOUSE CURSOR OVER THIS TEXT -

-
@@ -8472,18 +7941,13 @@ y

- - - + + - - - -
1 -2 -
1 2 +
3 -4 -
+ 3 + 4 +

y

!! end -- 2.20.1