From: C. Scott Ananian Date: Tue, 14 Jun 2016 21:59:20 +0000 (-0400) Subject: Hook up Balancer as a Tidy implementation. X-Git-Tag: 1.31.0-rc.0~6397 X-Git-Url: http://git.cyclocoop.org/%7B%24www_url%7Dadmin/compta/exercices/?a=commitdiff_plain;h=ce081a3d7b5b24f39beb9955ccd5a9d5e79cfa86;p=lhc%2Fweb%2Fwiklou.git Hook up Balancer as a Tidy implementation. This is an HTML5-compliant parse/serialize tidy implementation, with well-delineated hacks to support the <p>
-wrapping done by legacy tidy. Change-Id: I4fd433fd6f1847061b0bf4b3e249c918720d4fae --- diff --git a/autoload.php b/autoload.php index eb0ce13719..f6b337290c 100644 --- a/autoload.php +++ b/autoload.php @@ -886,6 +886,7 @@ $wgAutoloadLocalClasses = [ 'MediaWiki\\Tidy\\BalanceStack' => __DIR__ . '/includes/tidy/Balancer.php', 'MediaWiki\\Tidy\\Balancer' => __DIR__ . '/includes/tidy/Balancer.php', 'MediaWiki\\Tidy\\Html5Depurate' => __DIR__ . '/includes/tidy/Html5Depurate.php', + 'MediaWiki\\Tidy\\Html5Internal' => __DIR__ . '/includes/tidy/Html5Internal.php', 'MediaWiki\\Tidy\\RaggettBase' => __DIR__ . '/includes/tidy/RaggettBase.php', 'MediaWiki\\Tidy\\RaggettExternal' => __DIR__ . '/includes/tidy/RaggettExternal.php', 'MediaWiki\\Tidy\\RaggettInternalHHVM' => __DIR__ . '/includes/tidy/RaggettInternalHHVM.php', diff --git a/includes/DefaultSettings.php b/includes/DefaultSettings.php index 722dfe378e..f3c50739f0 100644 --- a/includes/DefaultSettings.php +++ b/includes/DefaultSettings.php @@ -4216,6 +4216,8 @@ $wgAllowImageTag = false; * - RaggettInternalHHVM: Use the limited-functionality HHVM extension * - RaggettInternalPHP: Use the PECL extension * - RaggettExternal: Shell out to an external binary (tidyBin) + * - Html5Depurate: Use external Depurate service + * - Html5Internal: Use the built-in HTML5 balancer * * - tidyConfigFile: Path to configuration file for any of the Raggett drivers * - debugComment: True to add a comment to the output with warning messages diff --git a/includes/parser/MWTidy.php b/includes/parser/MWTidy.php index bdf3efba56..f281c25b3c 100644 --- a/includes/parser/MWTidy.php +++ b/includes/parser/MWTidy.php @@ -132,6 +132,9 @@ class MWTidy { case 'Html5Depurate': self::$instance = new MediaWiki\Tidy\Html5Depurate( $config ); break; + case 'Html5Internal': + self::$instance = new MediaWiki\Tidy\Html5Internal( $config ); + break; default: throw new MWException( "Invalid tidy driver: \"{$config['driver']}\"" ); } diff --git 
a/includes/tidy/Balancer.php b/includes/tidy/Balancer.php index 828b09b4f6..ba5e08e9ba 100644 --- a/includes/tidy/Balancer.php +++ b/includes/tidy/Balancer.php @@ -242,6 +242,34 @@ class BalanceSets { 'title' => true ] ]; + + // For tidy compatibility. + public static $tidyPWrapSet = [ + self::HTML_NAMESPACE => [ + 'body' => true, 'blockquote' => true, + // We parse with as the fragment context, but the top-level + // element on the stack is actually . We could use the + // "adjusted current node" everywhere to work around this, but it's + // easier just to add to the p-wrap set. + 'html' => true, + ], + ]; + public static $tidyInlineSet = [ + self::HTML_NAMESPACE => [ + 'a' => true, 'abbr' => true, 'acronym' => true, 'applet' => true, + 'b' => true, 'basefont' => true, 'bdo' => true, 'big' => true, + 'br' => true, 'button' => true, 'cite' => true, 'code' => true, + 'dfn' => true, 'em' => true, 'font' => true, 'i' => true, + 'iframe' => true, 'img' => true, 'input' => true, 'kbd' => true, + 'label' => true, 'legend' => true, 'map' => true, 'object' => true, + 'param' => true, 'q' => true, 'rb' => true, 'rbc' => true, + 'rp' => true, 'rt' => true, 'rtc' => true, 'ruby' => true, + 's' => true, 'samp' => true, 'select' => true, 'small' => true, + 'span' => true, 'strike' => true, 'strong' => true, 'sub' => true, + 'sup' => true, 'textarea' => true, 'tt' => true, 'u' => true, + 'var' => true, + ], + ]; } /** @@ -405,14 +433,37 @@ class BalanceElement { * * @see __toString() */ - public function flatten() { + public function flatten( $tidyCompat = false ) { Assert::parameter( $this->parent !== null, '$this', 'must be a child' ); Assert::parameter( $this->parent !== 'flat', '$this', 'already flat' ); $idx = array_search( $this, $this->parent->children, true ); Assert::parameter( $idx !== false, '$this', 'must be a child of its parent' ); - $flat = "{$this}"; + if ( $tidyCompat ) { + $blank = true; + foreach ( $this->children as $elt ) { + if ( !is_string( $elt ) ) { + 
$elt = $elt->flatten( $tidyCompat ); + } + if ( $blank && preg_match( '/[^\t\n\f\r ]/', $elt ) ) { + $blank = false; + } + } + if ( $this->isA( 'mw:p-wrap' ) ) { + $this->localName = 'p'; + } elseif ( $blank ) { + // Add 'mw-empty-elt' class so elements can be hidden via CSS + // for compatibility with legacy tidy. + if ( $this->attribs === '' ) { + $this->attribs = ' class="mw-empty-elt"'; + } + $blank = false; + } + $flat = $blank ? '' : "{$this}"; + } else { + $flat = "{$this}"; + } $this->parent->children[$idx] = $flat; $this->parent = 'flat'; # for assertion checking return $flat; @@ -537,6 +588,10 @@ class BalanceStack implements IteratorAggregate { * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent */ public $fosterParentMode = false; + /** + * Tidy compatibility mode, determines behavior of body/blockquote + */ + public $tidyCompat = false; /** * Create a new BalanceStack with a single BalanceElement on it, @@ -559,7 +614,8 @@ class BalanceStack implements IteratorAggregate { // Don't include the outer '....' $out = ''; foreach ( $this->elements[0]->children as $elt ) { - $out .= is_string( $elt ) ? $elt : $elt->flatten(); + $out .= is_string( $elt ) ? $elt : + $elt->flatten( $this->tidyCompat ); } return $out; } @@ -576,6 +632,12 @@ class BalanceStack implements IteratorAggregate { $this->currentNode()->isA( BalanceSets::$tableSectionRowSet ) ) { $this->fosterParent( $value ); + } elseif ( + $this->tidyCompat && + $this->currentNode()->isA( BalanceSets::$tidyPWrapSet ) + ) { + $this->insertHTMLELement( 'mw:p-wrap', '' ); + return $this->insertText( $value ); } else { $this->currentNode()->appendChild( $value ); } @@ -619,6 +681,13 @@ class BalanceStack implements IteratorAggregate { */ public function insertElement( $elt ) { Assert::parameterType( 'MediaWiki\Tidy\BalanceElement', $elt, '$elt' ); + if ( + $this->currentNode()->isA( 'mw:p-wrap' ) && + !$elt->isA( BalanceSets::$tidyInlineSet ) + ) { + // Tidy compatibility. 
+ $this->pop(); + } if ( $this->fosterParentMode && $this->currentNode()->isA( BalanceSets::$tableSectionRowSet ) @@ -797,7 +866,9 @@ class BalanceStack implements IteratorAggregate { */ public function pop() { $elt = array_pop( $this->elements ); - $elt->flatten(); + if ( !$elt->isA( 'mw:p-wrap' ) ) { + $elt->flatten( $this->tidyCompat ); + } } /** @@ -868,7 +939,7 @@ class BalanceStack implements IteratorAggregate { // otherwise, it will eventually serialize when the parent // is serialized, we just hold onto the memory for its // tree of objects a little longer. - $elt->flatten(); + $elt->flatten( $this->tidyCompat ); } Assert::postcondition( array_search( $elt, $this->elements, true ) === false, @@ -915,6 +986,32 @@ class BalanceStack implements IteratorAggregate { } else { $parent = $this->elements[0]; // the `html` element. } + + if ( $this->tidyCompat ) { + if ( is_string( $elt ) ) { + // We're fostering text: do we need a p-wrapper? + if ( $parent->isA( BalanceSets::$tidyPWrapSet ) ) { + $this->insertHTMLElement( 'mw:p-wrap', '' ); + $this->insertText( $elt ); + return $elt; + } + } else { + // We're fostering an element; do we need to merge p-wrappers? + if ( $elt->isA( 'mw:p-wrap' ) ) { + $idx = $before ? + array_search( $before, $parent->children, true ) : + count( $parent->children ); + $after = $idx > 0 ? $parent->children[$idx - 1] : ''; + if ( + $after instanceof BalanceElement && + $after->isA( 'mw:p-wrap' ) + ) { + return $after; // Re-use existing p-wrapper. + } + } + } + } + if ( $before ) { $parent->insertBefore( $before, $elt ); } else { @@ -1402,6 +1499,7 @@ class Balancer { private $afe; private $stack; private $strict; + private $tidyCompat; private $textIntegrationMode = false; private $pendingTableText; @@ -1420,14 +1518,22 @@ class Balancer { * When present, the keys of this associative array give * the acceptable HTML tag names. When not present, no * tag sanitization is done. + * 'tidyCompat' : boolean, defaults to false. 
+ * When true, the serialization algorithm is tweaked to + * provide historical compatibility with the old "tidy" + * program:

<p>-wrapping is done to the children of + * <body> and <blockquote>
elements, and empty elements + * are removed. */ public function __construct( array $config ) { $config = $config + [ 'strict' => false, 'allowedHtmlElements' => null, + 'tidyCompat' => false, ]; $this->allowedHtmlElements = $config['allowedHtmlElements']; $this->strict = $config['strict']; + $this->tidyCompat = $config['tidyCompat']; if ( $this->allowedHtmlElements !== null ) { # Sanity check! $bad = array_uintersect_assoc( @@ -1467,6 +1573,7 @@ class Balancer { $this->bitsIterator = new ExplodeIterator( '<', $text ); $this->afe = new BalanceActiveFormattingElements(); $this->stack = new BalanceStack(); + $this->stack->tidyCompat = $this->tidyCompat; $this->processingCallback = $processingCallback; $this->processingArgs = $processingArgs; diff --git a/includes/tidy/Html5Internal.php b/includes/tidy/Html5Internal.php new file mode 100644 index 0000000000..4ad820021f --- /dev/null +++ b/includes/tidy/Html5Internal.php @@ -0,0 +1,18 @@ + true, + 'tidyCompat' => true, + ] ); + $this->balancer = new Balancer( $this->config ); + } + + public function tidy( $text ) { + return $this->balancer->balance( $text ); + } +} diff --git a/tests/phpunit/includes/tidy/BalancerTest.php b/tests/phpunit/includes/tidy/BalancerTest.php index 078080c688..451a4926fa 100644 --- a/tests/phpunit/includes/tidy/BalancerTest.php +++ b/tests/phpunit/includes/tidy/BalancerTest.php @@ -14,6 +14,7 @@ class BalancerTest extends MediaWikiTestCase { $this->balancer = new MediaWiki\Tidy\Balancer( [ 'strict' => false, /* not strict */ 'allowedHtmlElements' => null, /* no sanitization */ + 'tidyCompat' => false, /* standard parser */ ] ); }