From: Kunal Mehta Date: Mon, 7 May 2018 08:34:13 +0000 (-0700) Subject: tidy: Remove obsolete Depurate and Balancer drivers X-Git-Tag: 1.34.0-rc.0~5496^2 X-Git-Url: https://git.cyclocoop.org/%7B%24www_url%7Dadmin/compta/exercices/journal.php?a=commitdiff_plain;h=853b8fe34c717f44a03d4ed164732ee55f38bcb3;p=lhc%2Fweb%2Fwiklou.git tidy: Remove obsolete Depurate and Balancer drivers The Html5Depurate driver was intended to be used with an external Java service, but it never gained traction due to deployment concerns. The Html5Internal (Balancer) driver was originally intended for use with the balanced templates proposal and could also handle tidying. But it was tightly coupled to MediaWiki, so part of it was used as the basis of the RemexHtml library. Remex most likely can also implement the balanced templates proposal, so there isn't any reason to keep the Balancer code around anymore. Change-Id: I8542d69e9cdbf0e2fb7ebbb919933a64c1b8c293 --- diff --git a/RELEASE-NOTES-1.32 b/RELEASE-NOTES-1.32 index b7500aeb17..9fd3161f1e 100644 --- a/RELEASE-NOTES-1.32 +++ b/RELEASE-NOTES-1.32 @@ -15,6 +15,8 @@ production. $wgJpegQuality (default 80). This aligns the quality to what ImageMagick uses. * $wgExperimentalHtmlIds, deprecated since 1.30, has been removed. The 'html5-legacy' value for $wgFragmentMode is no longer accepted. +* The experimental Html5Internal and Html5Depurate tidy drivers were removed. + RemexHtml, which is the default, should be used instead. === New features in 1.32 === * (T112474) Generalized the ResourceLoader mechanism for overriding modules @@ -76,6 +78,9 @@ because of Phabricator reports. * mw.util.updateTooltipAccessKeys(), deprecated in 1.24, was removed. Use jquery.accessKeyLabel instead. * The SqlDataUpdate class, deprecated in 1.28, has been removed. +* The Html5Internal and Html5Depurate tidy driver classes were removed, along with the + Balancer tidy implementation. Both implementations were experimental, and were replaced + by RemexHtml. 
=== Deprecations in 1.32 === * Use of a StartProfiler.php file is deprecated in favour of placing diff --git a/autoload.php b/autoload.php index e316bb521d..f4ae8f6535 100644 --- a/autoload.php +++ b/autoload.php @@ -970,14 +970,6 @@ $wgAutoloadLocalClasses = [ 'MediaWiki\\Storage\\SlotRecord' => __DIR__ . '/includes/Storage/SlotRecord.php', 'MediaWiki\\Storage\\SqlBlobStore' => __DIR__ . '/includes/Storage/SqlBlobStore.php', 'MediaWiki\\Storage\\SuppressedDataException' => __DIR__ . '/includes/Storage/SuppressedDataException.php', - 'MediaWiki\\Tidy\\BalanceActiveFormattingElements' => __DIR__ . '/includes/tidy/Balancer.php', - 'MediaWiki\\Tidy\\BalanceElement' => __DIR__ . '/includes/tidy/Balancer.php', - 'MediaWiki\\Tidy\\BalanceMarker' => __DIR__ . '/includes/tidy/Balancer.php', - 'MediaWiki\\Tidy\\BalanceSets' => __DIR__ . '/includes/tidy/Balancer.php', - 'MediaWiki\\Tidy\\BalanceStack' => __DIR__ . '/includes/tidy/Balancer.php', - 'MediaWiki\\Tidy\\Balancer' => __DIR__ . '/includes/tidy/Balancer.php', - 'MediaWiki\\Tidy\\Html5Depurate' => __DIR__ . '/includes/tidy/Html5Depurate.php', - 'MediaWiki\\Tidy\\Html5Internal' => __DIR__ . '/includes/tidy/Html5Internal.php', 'MediaWiki\\Tidy\\RaggettBase' => __DIR__ . '/includes/tidy/RaggettBase.php', 'MediaWiki\\Tidy\\RaggettExternal' => __DIR__ . '/includes/tidy/RaggettExternal.php', 'MediaWiki\\Tidy\\RaggettInternalHHVM' => __DIR__ . 
'/includes/tidy/RaggettInternalHHVM.php', diff --git a/includes/DefaultSettings.php b/includes/DefaultSettings.php index 2dc43fe118..ff8612b3a7 100644 --- a/includes/DefaultSettings.php +++ b/includes/DefaultSettings.php @@ -4273,8 +4273,6 @@ $wgAllowImageTag = false; * - RaggettInternalHHVM: Use the limited-functionality HHVM extension * - RaggettInternalPHP: Use the PECL extension * - RaggettExternal: Shell out to an external binary (tidyBin) - * - Html5Depurate: Use external Depurate service - * - Html5Internal: Use the Balancer library in PHP * - RemexHtml: Use the RemexHtml library in PHP * * - tidyConfigFile: Path to configuration file for any of the Raggett drivers diff --git a/includes/parser/MWTidy.php b/includes/parser/MWTidy.php index 19cf573157..5788986f2e 100644 --- a/includes/parser/MWTidy.php +++ b/includes/parser/MWTidy.php @@ -111,12 +111,6 @@ class MWTidy { case 'RaggettExternal': $instance = new MediaWiki\Tidy\RaggettExternal( $config ); break; - case 'Html5Depurate': - $instance = new MediaWiki\Tidy\Html5Depurate( $config ); - break; - case 'Html5Internal': - $instance = new MediaWiki\Tidy\Html5Internal( $config ); - break; case 'RemexHtml': $instance = new MediaWiki\Tidy\RemexDriver( $config ); break; diff --git a/includes/tidy/Balancer.php b/includes/tidy/Balancer.php deleted file mode 100644 index 6671f49ba7..0000000000 --- a/includes/tidy/Balancer.php +++ /dev/null @@ -1,3584 +0,0 @@ - [ - 'html' => true, 'head' => true, 'body' => true, 'frameset' => true, - 'frame' => true, - 'plaintext' => true, - 'xmp' => true, 'iframe' => true, 'noembed' => true, - 'noscript' => true, 'script' => true, - 'title' => true - ] - ]; - - public static $emptyElementSet = [ - self::HTML_NAMESPACE => [ - 'area' => true, 'base' => true, 'basefont' => true, - 'bgsound' => true, 'br' => true, 'col' => true, 'command' => true, - 'embed' => true, 'frame' => true, 'hr' => true, 'img' => true, - 'input' => true, 'keygen' => true, 'link' => true, 'meta' => true, - 
'param' => true, 'source' => true, 'track' => true, 'wbr' => true - ] - ]; - - public static $extraLinefeedSet = [ - self::HTML_NAMESPACE => [ - 'pre' => true, 'textarea' => true, 'listing' => true, - ] - ]; - - public static $headingSet = [ - self::HTML_NAMESPACE => [ - 'h1' => true, 'h2' => true, 'h3' => true, - 'h4' => true, 'h5' => true, 'h6' => true - ] - ]; - - public static $specialSet = [ - self::HTML_NAMESPACE => [ - 'address' => true, 'applet' => true, 'area' => true, - 'article' => true, 'aside' => true, 'base' => true, - 'basefont' => true, 'bgsound' => true, 'blockquote' => true, - 'body' => true, 'br' => true, 'button' => true, 'caption' => true, - 'center' => true, 'col' => true, 'colgroup' => true, 'dd' => true, - 'details' => true, 'dir' => true, 'div' => true, 'dl' => true, - 'dt' => true, 'embed' => true, 'fieldset' => true, - 'figcaption' => true, 'figure' => true, 'footer' => true, - 'form' => true, 'frame' => true, 'frameset' => true, 'h1' => true, - 'h2' => true, 'h3' => true, 'h4' => true, 'h5' => true, - 'h6' => true, 'head' => true, 'header' => true, 'hgroup' => true, - 'hr' => true, 'html' => true, 'iframe' => true, 'img' => true, - 'input' => true, 'li' => true, 'link' => true, - 'listing' => true, 'main' => true, 'marquee' => true, - 'menu' => true, 'meta' => true, 'nav' => true, - 'noembed' => true, 'noframes' => true, 'noscript' => true, - 'object' => true, 'ol' => true, 'p' => true, 'param' => true, - 'plaintext' => true, 'pre' => true, 'script' => true, - 'section' => true, 'select' => true, 'source' => true, - 'style' => true, 'summary' => true, 'table' => true, - 'tbody' => true, 'td' => true, 'template' => true, - 'textarea' => true, 'tfoot' => true, 'th' => true, 'thead' => true, - 'title' => true, 'tr' => true, 'track' => true, 'ul' => true, - 'wbr' => true, 'xmp' => true - ], - self::SVG_NAMESPACE => [ - 'foreignobject' => true, 'desc' => true, 'title' => true - ], - self::MATHML_NAMESPACE => [ - 'mi' => true, 'mo' => true, 
'mn' => true, 'ms' => true, - 'mtext' => true, 'annotation-xml' => true - ] - ]; - - public static $addressDivPSet = [ - self::HTML_NAMESPACE => [ - 'address' => true, 'div' => true, 'p' => true - ] - ]; - - public static $tableSectionRowSet = [ - self::HTML_NAMESPACE => [ - 'table' => true, 'thead' => true, 'tbody' => true, - 'tfoot' => true, 'tr' => true - ] - ]; - - public static $impliedEndTagsSet = [ - self::HTML_NAMESPACE => [ - 'dd' => true, 'dt' => true, 'li' => true, - 'menuitem' => true, 'optgroup' => true, - 'option' => true, 'p' => true, 'rb' => true, 'rp' => true, - 'rt' => true, 'rtc' => true - ] - ]; - - public static $thoroughImpliedEndTagsSet = [ - self::HTML_NAMESPACE => [ - 'caption' => true, 'colgroup' => true, 'dd' => true, 'dt' => true, - 'li' => true, 'optgroup' => true, 'option' => true, 'p' => true, - 'rb' => true, 'rp' => true, 'rt' => true, 'rtc' => true, - 'tbody' => true, 'td' => true, 'tfoot' => true, 'th' => true, - 'thead' => true, 'tr' => true - ] - ]; - - public static $tableCellSet = [ - self::HTML_NAMESPACE => [ - 'td' => true, 'th' => true - ] - ]; - public static $tableContextSet = [ - self::HTML_NAMESPACE => [ - 'table' => true, 'template' => true, 'html' => true - ] - ]; - - public static $tableBodyContextSet = [ - self::HTML_NAMESPACE => [ - 'tbody' => true, 'tfoot' => true, 'thead' => true, - 'template' => true, 'html' => true - ] - ]; - - public static $tableRowContextSet = [ - self::HTML_NAMESPACE => [ - 'tr' => true, 'template' => true, 'html' => true - ] - ]; - - // See https://html.spec.whatwg.org/multipage/forms.html#form-associated-element - public static $formAssociatedSet = [ - self::HTML_NAMESPACE => [ - 'button' => true, 'fieldset' => true, 'input' => true, - 'keygen' => true, 'object' => true, 'output' => true, - 'select' => true, 'textarea' => true, 'img' => true - ] - ]; - - public static $inScopeSet = [ - self::HTML_NAMESPACE => [ - 'applet' => true, 'caption' => true, 'html' => true, - 'marquee' => true, 
'object' => true, - 'table' => true, 'td' => true, 'template' => true, - 'th' => true - ], - self::SVG_NAMESPACE => [ - 'foreignobject' => true, 'desc' => true, 'title' => true - ], - self::MATHML_NAMESPACE => [ - 'mi' => true, 'mo' => true, 'mn' => true, 'ms' => true, - 'mtext' => true, 'annotation-xml' => true - ] - ]; - - private static $inListItemScopeSet = null; - public static function inListItemScopeSet() { - if ( self::$inListItemScopeSet === null ) { - self::$inListItemScopeSet = self::$inScopeSet; - self::$inListItemScopeSet[self::HTML_NAMESPACE]['ol'] = true; - self::$inListItemScopeSet[self::HTML_NAMESPACE]['ul'] = true; - } - return self::$inListItemScopeSet; - } - - private static $inButtonScopeSet = null; - public static function inButtonScopeSet() { - if ( self::$inButtonScopeSet === null ) { - self::$inButtonScopeSet = self::$inScopeSet; - self::$inButtonScopeSet[self::HTML_NAMESPACE]['button'] = true; - } - return self::$inButtonScopeSet; - } - - public static $inTableScopeSet = [ - self::HTML_NAMESPACE => [ - 'html' => true, 'table' => true, 'template' => true - ] - ]; - - public static $inInvertedSelectScopeSet = [ - self::HTML_NAMESPACE => [ - 'option' => true, 'optgroup' => true - ] - ]; - - public static $mathmlTextIntegrationPointSet = [ - self::MATHML_NAMESPACE => [ - 'mi' => true, 'mo' => true, 'mn' => true, 'ms' => true, - 'mtext' => true - ] - ]; - - public static $htmlIntegrationPointSet = [ - self::SVG_NAMESPACE => [ - 'foreignobject' => true, - 'desc' => true, - 'title' => true - ] - ]; - - // For tidy compatibility. - public static $tidyPWrapSet = [ - self::HTML_NAMESPACE => [ - 'body' => true, 'blockquote' => true, - // We parse with as the fragment context, but the top-level - // element on the stack is actually . We could use the - // "adjusted current node" everywhere to work around this, but it's - // easier just to add to the p-wrap set. 
- 'html' => true, - ], - ]; - public static $tidyInlineSet = [ - self::HTML_NAMESPACE => [ - 'a' => true, 'abbr' => true, 'acronym' => true, 'applet' => true, - 'b' => true, 'basefont' => true, 'bdo' => true, 'big' => true, - 'br' => true, 'button' => true, 'cite' => true, 'code' => true, - 'dfn' => true, 'em' => true, 'font' => true, 'i' => true, - 'iframe' => true, 'img' => true, 'input' => true, 'kbd' => true, - 'label' => true, 'legend' => true, 'map' => true, 'object' => true, - 'param' => true, 'q' => true, 'rb' => true, 'rbc' => true, - 'rp' => true, 'rt' => true, 'rtc' => true, 'ruby' => true, - 's' => true, 'samp' => true, 'select' => true, 'small' => true, - 'span' => true, 'strike' => true, 'strong' => true, 'sub' => true, - 'sup' => true, 'textarea' => true, 'tt' => true, 'u' => true, - 'var' => true, - // Those defined in tidy.conf - 'video' => true, 'audio' => true, 'bdi' => true, 'data' => true, - 'time' => true, 'mark' => true, - ], - ]; -} - -/** - * A BalanceElement is a simplified version of a DOM Node. The main - * difference is that we only keep BalanceElements around for nodes - * currently on the BalanceStack of open elements. As soon as an - * element is closed, with some minor exceptions relating to the - * tree builder "adoption agency algorithm", the element and all its - * children are serialized to a string using the flatten() method. - * This keeps our memory usage low. - * - * @ingroup Parser - * @since 1.27 - */ -class BalanceElement { - /** - * The namespace of the element. - * @var string $namespaceURI - */ - public $namespaceURI; - /** - * The lower-cased name of the element. - * @var string $localName - */ - public $localName; - /** - * Attributes for the element, in array form - * @var array $attribs - */ - public $attribs; - - /** - * Parent of this element, or the string "flat" if this element has - * already been flattened into its parent. 
- * @var BalanceElement|string|null $parent - */ - public $parent; - - /** - * An array of children of this element. Typically only the last - * child will be an actual BalanceElement object; the rest will - * be strings, representing either text nodes or flattened - * BalanceElement objects. - * @var BalanceElement[]|string[] $children - */ - public $children; - - /** - * A unique string identifier for Noah's Ark purposes, lazy initialized - */ - private $noahKey; - - /** - * The next active formatting element in the list, or null if this is the - * end of the AFE list or if the element is not in the AFE list. - */ - public $nextAFE; - - /** - * The previous active formatting element in the list, or null if this is - * the start of the list or if the element is not in the AFE list. - */ - public $prevAFE; - - /** - * The next element in the Noah's Ark species bucket. - */ - public $nextNoah; - - /** - * Make a new BalanceElement corresponding to the HTML DOM Element - * with the given localname, namespace, and attributes. - * - * @param string $namespaceURI The namespace of the element. - * @param string $localName The lowercased name of the tag. - * @param array $attribs Attributes of the element - */ - public function __construct( $namespaceURI, $localName, array $attribs ) { - $this->localName = $localName; - $this->namespaceURI = $namespaceURI; - $this->attribs = $attribs; - $this->contents = ''; - $this->parent = null; - $this->children = []; - } - - /** - * Remove the given child from this element. 
- * @param BalanceElement $elt - */ - private function removeChild( BalanceElement $elt ) { - Assert::precondition( - $this->parent !== 'flat', "Can't removeChild after flattening $this" - ); - Assert::parameter( - $elt->parent === $this, 'elt', 'must have $this as a parent' - ); - $idx = array_search( $elt, $this->children, true ); - Assert::parameter( $idx !== false, '$elt', 'must be a child of $this' ); - $elt->parent = null; - array_splice( $this->children, $idx, 1 ); - } - - /** - * Find $a in the list of children and insert $b before it. - * @param BalanceElement $a - * @param BalanceElement|string $b - */ - public function insertBefore( BalanceElement $a, $b ) { - Assert::precondition( - $this->parent !== 'flat', "Can't insertBefore after flattening." - ); - $idx = array_search( $a, $this->children, true ); - Assert::parameter( $idx !== false, '$a', 'must be a child of $this' ); - if ( is_string( $b ) ) { - array_splice( $this->children, $idx, 0, [ $b ] ); - } else { - Assert::parameter( $b->parent !== 'flat', '$b', "Can't be flat" ); - if ( $b->parent !== null ) { - $b->parent->removeChild( $b ); - } - array_splice( $this->children, $idx, 0, [ $b ] ); - $b->parent = $this; - } - } - - /** - * Append $elt to the end of the list of children. - * @param BalanceElement|string $elt - */ - public function appendChild( $elt ) { - Assert::precondition( - $this->parent !== 'flat', "Can't appendChild after flattening." - ); - if ( is_string( $elt ) ) { - array_push( $this->children, $elt ); - return; - } - // Remove $elt from parent, if it had one. - if ( $elt->parent !== null ) { - $elt->parent->removeChild( $elt ); - } - array_push( $this->children, $elt ); - $elt->parent = $this; - } - - /** - * Transfer all of the children of $elt to $this. - * @param BalanceElement $elt - */ - public function adoptChildren( BalanceElement $elt ) { - Assert::precondition( - $elt->parent !== 'flat', "Can't adoptChildren after flattening." 
- ); - foreach ( $elt->children as $child ) { - if ( !is_string( $child ) ) { - // This is an optimization which avoids an O(n^2) set of - // array_splice operations. - $child->parent = null; - } - $this->appendChild( $child ); - } - $elt->children = []; - } - - /** - * Flatten this node and all of its children into a string, as specified - * by the HTML serialization specification, and replace this node - * in its parent by that string. - * - * @param array $config Balancer configuration; see Balancer::__construct(). - * @return string - * - * @see __toString() - */ - public function flatten( array $config ) { - Assert::parameter( $this->parent !== null, '$this', 'must be a child' ); - Assert::parameter( $this->parent !== 'flat', '$this', 'already flat' ); - $idx = array_search( $this, $this->parent->children, true ); - Assert::parameter( - $idx !== false, '$this', 'must be a child of its parent' - ); - $tidyCompat = $config['tidyCompat']; - if ( $tidyCompat ) { - $blank = true; - foreach ( $this->children as $elt ) { - if ( !is_string( $elt ) ) { - $elt = $elt->flatten( $config ); - } - if ( $blank && preg_match( '/[^\t\n\f\r ]/', $elt ) ) { - $blank = false; - } - } - if ( $this->isHtmlNamed( 'mw:p-wrap' ) ) { - $this->localName = 'p'; - } elseif ( $blank ) { - // Add 'mw-empty-elt' class so elements can be hidden via CSS - // for compatibility with legacy tidy. - if ( !count( $this->attribs ) && - ( $this->localName === 'tr' || $this->localName === 'li' ) - ) { - $this->attribs = [ 'class' => "mw-empty-elt" ]; - } - $blank = false; - } elseif ( - $this->isA( BalanceSets::$extraLinefeedSet ) && - count( $this->children ) > 0 && - substr( $this->children[0], 0, 1 ) == "\n" - ) { - // Double the linefeed after pre/listing/textarea - // according to the (old) HTML5 fragment serialization - // algorithm (see https://github.com/whatwg/html/issues/944) - // to ensure this will round-trip. - array_unshift( $this->children, "\n" ); - } - $flat = $blank ? 
'' : "{$this}"; - } else { - $flat = "{$this}"; - } - $this->parent->children[$idx] = $flat; - $this->parent = 'flat'; // for assertion checking - return $flat; - } - - /** - * Serialize this node and all of its children to a string, as specified - * by the HTML serialization specification. - * - * @return string The serialization of the BalanceElement - * @see https://html.spec.whatwg.org/multipage/syntax.html#serialising-html-fragments - */ - public function __toString() { - $encAttribs = ''; - foreach ( $this->attribs as $name => $value ) { - $encValue = Sanitizer::encodeAttribute( $value ); - $encAttribs .= " $name=\"$encValue\""; - } - if ( !$this->isA( BalanceSets::$emptyElementSet ) ) { - $out = "<{$this->localName}{$encAttribs}>"; - $len = strlen( $out ); - // flatten children - foreach ( $this->children as $elt ) { - $out .= "{$elt}"; - } - $out .= "localName}>"; - } else { - $out = "<{$this->localName}{$encAttribs} />"; - Assert::invariant( - count( $this->children ) === 0, - "Empty elements shouldn't have children." - ); - } - return $out; - } - - // Utility functions on BalanceElements. - - /** - * Determine if $this represents a specific HTML tag, is a member of - * a tag set, or is equal to another BalanceElement. - * - * @param BalanceElement|array|string $set The target BalanceElement, - * set (from the BalanceSets class), or string (HTML tag name). - * @return bool - */ - public function isA( $set ) { - if ( $set instanceof BalanceElement ) { - return $this === $set; - } elseif ( is_array( $set ) ) { - return isset( $set[$this->namespaceURI] ) && - isset( $set[$this->namespaceURI][$this->localName] ); - } else { - // assume this is an HTML element name. 
- return $this->isHtml() && $this->localName === $set; - } - } - - /** - * Determine if this element is an HTML element with the specified name - * @param string $tagName - * @return bool - */ - public function isHtmlNamed( $tagName ) { - return $this->namespaceURI === BalanceSets::HTML_NAMESPACE - && $this->localName === $tagName; - } - - /** - * Determine if $this represents an element in the HTML namespace. - * - * @return bool - */ - public function isHtml() { - return $this->namespaceURI === BalanceSets::HTML_NAMESPACE; - } - - /** - * Determine if $this represents a MathML text integration point, - * as defined in the HTML5 specification. - * - * @return bool - * @see https://html.spec.whatwg.org/multipage/syntax.html#mathml-text-integration-point - */ - public function isMathmlTextIntegrationPoint() { - return $this->isA( BalanceSets::$mathmlTextIntegrationPointSet ); - } - - /** - * Determine if $this represents an HTML integration point, - * as defined in the HTML5 specification. - * - * @return bool - * @see https://html.spec.whatwg.org/multipage/syntax.html#html-integration-point - */ - public function isHtmlIntegrationPoint() { - if ( $this->isA( BalanceSets::$htmlIntegrationPointSet ) ) { - return true; - } - if ( - $this->namespaceURI === BalanceSets::MATHML_NAMESPACE && - $this->localName === 'annotation-xml' && - isset( $this->attribs['encoding'] ) && - ( strcasecmp( $this->attribs['encoding'], 'text/html' ) == 0 || - strcasecmp( $this->attribs['encoding'], 'application/xhtml+xml' ) == 0 ) - ) { - return true; - } - return false; - } - - /** - * Get a string key for the Noah's Ark algorithm - * @return string - */ - public function getNoahKey() { - if ( $this->noahKey === null ) { - $attribs = $this->attribs; - ksort( $attribs ); - $this->noahKey = serialize( [ $this->namespaceURI, $this->localName, $attribs ] ); - } - return $this->noahKey; - } -} - -/** - * The "stack of open elements" as defined in the HTML5 tree builder - * spec. 
This contains methods to ensure that content (start tags, text) - * are inserted at the correct place in the output string, and to - * flatten BalanceElements are they are closed to avoid holding onto - * a complete DOM tree for the document in memory. - * - * The stack defines a PHP iterator to traverse it in "reverse order", - * that is, the most-recently-added element is visited first in a - * foreach loop. - * - * @ingroup Parser - * @since 1.27 - * @see https://html.spec.whatwg.org/multipage/syntax.html#the-stack-of-open-elements - */ -class BalanceStack implements IteratorAggregate { - /** - * Backing storage for the stack. - * @var BalanceElement[] $elements - */ - private $elements = []; - /** - * Foster parent mode determines how nodes are inserted into the - * stack. - * @var bool $fosterParentMode - * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent - */ - public $fosterParentMode = false; - /** - * Configuration options governing flattening. - * @var array $config - * @see Balancer::__construct() - */ - private $config; - /** - * Reference to the current element - */ - public $currentNode; - - /** - * Create a new BalanceStack with a single BalanceElement on it, - * representing the root <html> node. - * @param array $config Balancer configuration; see Balancer::_construct(). - */ - public function __construct( array $config ) { - // always a root element on the stack - array_push( - $this->elements, - new BalanceElement( BalanceSets::HTML_NAMESPACE, 'html', [] ) - ); - $this->currentNode = $this->elements[0]; - $this->config = $config; - } - - /** - * Return a string representing the output of the tree builder: - * all the children of the root <html> node. - * @return string - */ - public function getOutput() { - // Don't include the outer '....' - $out = ''; - foreach ( $this->elements[0]->children as $elt ) { - $out .= is_string( $elt ) ? 
$elt : - $elt->flatten( $this->config ); - } - return $out; - } - - /** - * Insert a comment at the appropriate place for inserting a node. - * @param string $value Content of the comment. - * @return string - * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-comment - */ - public function insertComment( $value ) { - // Just another type of text node, except for tidy p-wrapping. - return $this->insertText( '', true ); - } - - /** - * Insert text at the appropriate place for inserting a node. - * @param string $value - * @param bool $isComment - * @return string - * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node - */ - public function insertText( $value, $isComment = false ) { - if ( - $this->fosterParentMode && - $this->currentNode->isA( BalanceSets::$tableSectionRowSet ) - ) { - $this->fosterParent( $value ); - } elseif ( - $this->config['tidyCompat'] && !$isComment && - $this->currentNode->isA( BalanceSets::$tidyPWrapSet ) - ) { - $this->insertHTMLElement( 'mw:p-wrap', [] ); - return $this->insertText( $value ); - } else { - $this->currentNode->appendChild( $value ); - } - } - - /** - * Insert a BalanceElement at the appropriate place, pushing it - * on to the open elements stack. - * @param string $namespaceURI The element namespace - * @param string $tag The tag name - * @param string $attribs Normalized attributes, as a string. - * @return BalanceElement - * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-a-foreign-element - */ - public function insertForeignElement( $namespaceURI, $tag, $attribs ) { - return $this->insertElement( - new BalanceElement( $namespaceURI, $tag, $attribs ) - ); - } - - /** - * Insert an HTML element at the appropriate place, pushing it on to - * the open elements stack. - * @param string $tag The tag name - * @param string $attribs Normalized attributes, as a string. 
- * @return BalanceElement - * @see https://html.spec.whatwg.org/multipage/syntax.html#insert-an-html-element - */ - public function insertHTMLElement( $tag, $attribs ) { - return $this->insertForeignElement( - BalanceSets::HTML_NAMESPACE, $tag, $attribs - ); - } - - /** - * Insert an element at the appropriate place and push it on to the - * open elements stack. - * @param BalanceElement $elt - * @return BalanceElement - * @see https://html.spec.whatwg.org/multipage/syntax.html#appropriate-place-for-inserting-a-node - */ - public function insertElement( BalanceElement $elt ) { - if ( - $this->currentNode->isHtmlNamed( 'mw:p-wrap' ) && - !$elt->isA( BalanceSets::$tidyInlineSet ) - ) { - // Tidy compatibility. - $this->pop(); - } - if ( - $this->fosterParentMode && - $this->currentNode->isA( BalanceSets::$tableSectionRowSet ) - ) { - $elt = $this->fosterParent( $elt ); - } else { - $this->currentNode->appendChild( $elt ); - } - Assert::invariant( $elt->parent !== null, "$elt must be in tree" ); - Assert::invariant( $elt->parent !== 'flat', "$elt must not have been previous flattened" ); - array_push( $this->elements, $elt ); - $this->currentNode = $elt; - return $elt; - } - - /** - * Determine if the stack has $tag in scope. - * @param BalanceElement|array|string $tag - * @return bool - * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-scope - */ - public function inScope( $tag ) { - return $this->inSpecificScope( $tag, BalanceSets::$inScopeSet ); - } - - /** - * Determine if the stack has $tag in button scope. - * @param BalanceElement|array|string $tag - * @return bool - * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-button-scope - */ - public function inButtonScope( $tag ) { - return $this->inSpecificScope( $tag, BalanceSets::inButtonScopeSet() ); - } - - /** - * Determine if the stack has $tag in list item scope. 
- * @param BalanceElement|array|string $tag - * @return bool - * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-list-item-scope - */ - public function inListItemScope( $tag ) { - return $this->inSpecificScope( $tag, BalanceSets::inListItemScopeSet() ); - } - - /** - * Determine if the stack has $tag in table scope. - * @param BalanceElement|array|string $tag - * @return bool - * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-table-scope - */ - public function inTableScope( $tag ) { - return $this->inSpecificScope( $tag, BalanceSets::$inTableScopeSet ); - } - - /** - * Determine if the stack has $tag in select scope. - * @param BalanceElement|array|string $tag - * @return bool - * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-select-scope - */ - public function inSelectScope( $tag ) { - // Can't use inSpecificScope to implement this, since it involves - // *inverting* a set of tags. Implement manually. - foreach ( $this as $elt ) { - if ( $elt->isA( $tag ) ) { - return true; - } - if ( !$elt->isA( BalanceSets::$inInvertedSelectScopeSet ) ) { - return false; - } - } - return false; - } - - /** - * Determine if the stack has $tag in a specific scope, $set. - * @param BalanceElement|array|string $tag - * @param BalanceElement|array|string $set - * @return bool - * @see https://html.spec.whatwg.org/multipage/syntax.html#has-an-element-in-the-specific-scope - */ - public function inSpecificScope( $tag, $set ) { - foreach ( $this as $elt ) { - if ( $elt->isA( $tag ) ) { - return true; - } - if ( $elt->isA( $set ) ) { - return false; - } - } - return false; - } - - /** - * Generate implied end tags. - * @param string $butnot - * @param bool $thorough True if we should generate end tags thoroughly. - * @see https://html.spec.whatwg.org/multipage/syntax.html#generate-implied-end-tags - */ - public function generateImpliedEndTags( $butnot = null, $thorough = false ) { - $endTagSet = $thorough ? 
- BalanceSets::$thoroughImpliedEndTagsSet : - BalanceSets::$impliedEndTagsSet; - while ( $this->currentNode ) { - if ( $butnot !== null && $this->currentNode->isHtmlNamed( $butnot ) ) { - break; - } - if ( !$this->currentNode->isA( $endTagSet ) ) { - break; - } - $this->pop(); - } - } - - /** - * Return the adjusted current node. - * @param string $fragmentContext - * @return string - */ - public function adjustedCurrentNode( $fragmentContext ) { - return ( $fragmentContext && count( $this->elements ) === 1 ) ? - $fragmentContext : $this->currentNode; - } - - /** - * Return an iterator over this stack which visits the current node - * first, and the root node last. - * @return \Iterator - */ - public function getIterator() { - return new ReverseArrayIterator( $this->elements ); - } - - /** - * Return the BalanceElement at the given position $idx, where - * position 0 represents the root element. - * @param int $idx - * @return BalanceElement - */ - public function node( $idx ) { - return $this->elements[ $idx ]; - } - - /** - * Replace the element at position $idx in the BalanceStack with $elt. - * @param int $idx - * @param BalanceElement $elt - */ - public function replaceAt( $idx, BalanceElement $elt ) { - Assert::precondition( - $this->elements[$idx]->parent !== 'flat', - 'Replaced element should not have already been flattened.' - ); - Assert::precondition( - $elt->parent !== 'flat', - 'New element should not have already been flattened.' - ); - $this->elements[$idx] = $elt; - if ( $idx === count( $this->elements ) - 1 ) { - $this->currentNode = $elt; - } - } - - /** - * Return the position of the given BalanceElement, set, or - * HTML tag name string in the BalanceStack. 
- * @param BalanceElement|array|string $tag - * @return int - */ - public function indexOf( $tag ) { - for ( $i = count( $this->elements ) - 1; $i >= 0; $i-- ) { - if ( $this->elements[$i]->isA( $tag ) ) { - return $i; - } - } - return -1; - } - - /** - * Return the number of elements currently in the BalanceStack. - * @return int - */ - public function length() { - return count( $this->elements ); - } - - /** - * Remove the current node from the BalanceStack, flattening it - * in the process. - */ - public function pop() { - $elt = array_pop( $this->elements ); - if ( count( $this->elements ) ) { - $this->currentNode = $this->elements[ count( $this->elements ) - 1 ]; - } else { - $this->currentNode = null; - } - if ( !$elt->isHtmlNamed( 'mw:p-wrap' ) ) { - $elt->flatten( $this->config ); - } - } - - /** - * Remove all nodes up to and including position $idx from the - * BalanceStack, flattening them in the process. - * @param int $idx - */ - public function popTo( $idx ) { - for ( $length = count( $this->elements ); $length > $idx; $length-- ) { - $this->pop(); - } - } - - /** - * Pop elements off the stack up to and including the first - * element with the specified HTML tagname (or matching the given - * set). - * @param BalanceElement|array|string $tag - */ - public function popTag( $tag ) { - while ( $this->currentNode ) { - if ( $this->currentNode->isA( $tag ) ) { - $this->pop(); - break; - } - $this->pop(); - } - } - - /** - * Pop elements off the stack *not including* the first element - * in the specified set. - * @param BalanceElement|array|string $set - */ - public function clearToContext( $set ) { - // Note that we don't loop to 0. Never pop the elt off. - for ( $length = count( $this->elements ); $length > 1; $length-- ) { - if ( $this->currentNode->isA( $set ) ) { - break; - } - $this->pop(); - } - } - - /** - * Remove the given $elt from the BalanceStack, optionally - * flattening it in the process. 
- * @param BalanceElement $elt The element to remove. - * @param bool $flatten Whether to flatten the removed element. - */ - public function removeElement( BalanceElement $elt, $flatten = true ) { - Assert::parameter( - $elt->parent !== 'flat', - '$elt', - '$elt should not already have been flattened.' - ); - Assert::parameter( - $elt->parent->parent !== 'flat', - '$elt', - 'The parent of $elt should not already have been flattened.' - ); - $idx = array_search( $elt, $this->elements, true ); - Assert::parameter( $idx !== false, '$elt', 'must be in stack' ); - array_splice( $this->elements, $idx, 1 ); - if ( $idx === count( $this->elements ) ) { - $this->currentNode = $this->elements[$idx - 1]; - } - if ( $flatten ) { - // serialize $elt into its parent - // otherwise, it will eventually serialize when the parent - // is serialized, we just hold onto the memory for its - // tree of objects a little longer. - $elt->flatten( $this->config ); - } - Assert::postcondition( - array_search( $elt, $this->elements, true ) === false, - '$elt should no longer be in open elements stack' - ); - } - - /** - * Find $a in the BalanceStack and insert $b after it. - * @param BalanceElement $a - * @param BalanceElement $b - */ - public function insertAfter( BalanceElement $a, BalanceElement $b ) { - $idx = $this->indexOf( $a ); - Assert::parameter( $idx !== false, '$a', 'must be in stack' ); - if ( $idx === count( $this->elements ) - 1 ) { - array_push( $this->elements, $b ); - $this->currentNode = $b; - } else { - array_splice( $this->elements, $idx + 1, 0, [ $b ] ); - } - } - - // Fostering and adoption. - - /** - * Foster parent the given $elt in the stack of open elements. 
- * @param BalanceElement|string $elt - * @return BalanceElement|string - * - * @see https://html.spec.whatwg.org/multipage/syntax.html#foster-parent - */ - private function fosterParent( $elt ) { - $lastTable = $this->indexOf( 'table' ); - $lastTemplate = $this->indexOf( 'template' ); - $parent = null; - $before = null; - - if ( $lastTemplate >= 0 && ( $lastTable < 0 || $lastTemplate > $lastTable ) ) { - $parent = $this->elements[$lastTemplate]; - } elseif ( $lastTable >= 0 ) { - $parent = $this->elements[$lastTable]->parent; - // Assume all tables have parents, since we're not running scripts! - Assert::invariant( - $parent !== null, "All tables should have parents" - ); - $before = $this->elements[$lastTable]; - } else { - $parent = $this->elements[0]; // the `html` element. - } - - if ( $this->config['tidyCompat'] ) { - if ( is_string( $elt ) ) { - // We're fostering text: do we need a p-wrapper? - if ( $parent->isA( BalanceSets::$tidyPWrapSet ) ) { - $this->insertHTMLElement( 'mw:p-wrap', [] ); - $this->insertText( $elt ); - return $elt; - } - } else { - // We're fostering an element; do we need to merge p-wrappers? - if ( $elt->isHtmlNamed( 'mw:p-wrap' ) ) { - $idx = $before ? - array_search( $before, $parent->children, true ) : - count( $parent->children ); - $after = $idx > 0 ? $parent->children[$idx - 1] : ''; - if ( - $after instanceof BalanceElement && - $after->isHtmlNamed( 'mw:p-wrap' ) - ) { - return $after; // Re-use existing p-wrapper. - } - } - } - } - - if ( $before ) { - $parent->insertBefore( $before, $elt ); - } else { - $parent->appendChild( $elt ); - } - return $elt; - } - - /** - * Run the "adoption agency algoritm" (AAA) for the given subject - * tag name. - * @param string $tag The subject tag name. - * @param BalanceActiveFormattingElements $afe The current - * active formatting elements list. - * @return true if the adoption agency algorithm "did something", false - * if more processing is required by the caller. 
- * @see https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm - */ - public function adoptionAgency( $tag, $afe ) { - // If the current node is an HTML element whose tag name is subject, - // and the current node is not in the list of active formatting - // elements, then pop the current node off the stack of open - // elements and abort these steps. - if ( - $this->currentNode->isHtmlNamed( $tag ) && - !$afe->isInList( $this->currentNode ) - ) { - $this->pop(); - return true; // no more handling required - } - - // Outer loop: If outer loop counter is greater than or - // equal to eight, then abort these steps. - for ( $outer = 0; $outer < 8; $outer++ ) { - // Let the formatting element be the last element in the list - // of active formatting elements that: is between the end of - // the list and the last scope marker in the list, if any, or - // the start of the list otherwise, and has the same tag name - // as the token. - $fmtElt = $afe->findElementByTag( $tag ); - - // If there is no such node, then abort these steps and instead - // act as described in the "any other end tag" entry below. - if ( !$fmtElt ) { - return false; // false means handle by the default case - } - - // Otherwise, if there is such a node, but that node is not in - // the stack of open elements, then this is a parse error; - // remove the element from the list, and abort these steps. - $index = $this->indexOf( $fmtElt ); - if ( $index < 0 ) { - $afe->remove( $fmtElt ); - return true; // true means no more handling required - } - - // Otherwise, if there is such a node, and that node is also in - // the stack of open elements, but the element is not in scope, - // then this is a parse error; ignore the token, and abort - // these steps. 
- if ( !$this->inScope( $fmtElt ) ) { - return true; - } - - // Let the furthest block be the topmost node in the stack of - // open elements that is lower in the stack than the formatting - // element, and is an element in the special category. There - // might not be one. - $furthestBlock = null; - $furthestBlockIndex = -1; - $stackLength = $this->length(); - for ( $i = $index + 1; $i < $stackLength; $i++ ) { - if ( $this->node( $i )->isA( BalanceSets::$specialSet ) ) { - $furthestBlock = $this->node( $i ); - $furthestBlockIndex = $i; - break; - } - } - - // If there is no furthest block, then the UA must skip the - // subsequent steps and instead just pop all the nodes from the - // bottom of the stack of open elements, from the current node - // up to and including the formatting element, and remove the - // formatting element from the list of active formatting - // elements. - if ( !$furthestBlock ) { - $this->popTag( $fmtElt ); - $afe->remove( $fmtElt ); - return true; - } - - // Let the common ancestor be the element immediately above - // the formatting element in the stack of open elements. - $ancestor = $this->node( $index - 1 ); - - // Let a bookmark note the position of the formatting - // element in the list of active formatting elements - // relative to the elements on either side of it in the - // list. - $BOOKMARK = new BalanceElement( '[bookmark]', '[bookmark]', [] ); - $afe->insertAfter( $fmtElt, $BOOKMARK ); - - // Let node and last node be the furthest block. - $node = $furthestBlock; - $lastNode = $furthestBlock; - $nodeIndex = $furthestBlockIndex; - $isAFE = false; - - // Inner loop - for ( $inner = 1; true; $inner++ ) { - // Let node be the element immediately above node in - // the stack of open elements, or if node is no longer - // in the stack of open elements (e.g. because it got - // removed by this algorithm), the element that was - // immediately above node in the stack of open elements - // before node was removed. 
- $node = $this->node( --$nodeIndex ); - - // If node is the formatting element, then go - // to the next step in the overall algorithm. - if ( $node === $fmtElt ) break; - - // If the inner loop counter is greater than three and node - // is in the list of active formatting elements, then remove - // node from the list of active formatting elements. - $isAFE = $afe->isInList( $node ); - if ( $inner > 3 && $isAFE ) { - $afe->remove( $node ); - $isAFE = false; - } - - // If node is not in the list of active formatting - // elements, then remove node from the stack of open - // elements and then go back to the step labeled inner - // loop. - if ( !$isAFE ) { - // Don't flatten here, since we're about to relocate - // parts of this $node. - $this->removeElement( $node, false ); - continue; - } - - // Create an element for the token for which the - // element node was created with common ancestor as - // the intended parent, replace the entry for node - // in the list of active formatting elements with an - // entry for the new element, replace the entry for - // node in the stack of open elements with an entry for - // the new element, and let node be the new element. - $newElt = new BalanceElement( - $node->namespaceURI, $node->localName, $node->attribs ); - $afe->replace( $node, $newElt ); - $this->replaceAt( $nodeIndex, $newElt ); - $node = $newElt; - - // If last node is the furthest block, then move the - // aforementioned bookmark to be immediately after the - // new node in the list of active formatting elements. - if ( $lastNode === $furthestBlock ) { - $afe->remove( $BOOKMARK ); - $afe->insertAfter( $newElt, $BOOKMARK ); - } - - // Insert last node into node, first removing it from - // its previous parent node if any. - $node->appendChild( $lastNode ); - - // Let last node be node. 
- $lastNode = $node; - } - - // If the common ancestor node is a table, tbody, tfoot, - // thead, or tr element, then, foster parent whatever last - // node ended up being in the previous step, first removing - // it from its previous parent node if any. - if ( - $this->fosterParentMode && - $ancestor->isA( BalanceSets::$tableSectionRowSet ) - ) { - $this->fosterParent( $lastNode ); - } else { - // Otherwise, append whatever last node ended up being in - // the previous step to the common ancestor node, first - // removing it from its previous parent node if any. - $ancestor->appendChild( $lastNode ); - } - - // Create an element for the token for which the - // formatting element was created, with furthest block - // as the intended parent. - $newElt2 = new BalanceElement( - $fmtElt->namespaceURI, $fmtElt->localName, $fmtElt->attribs ); - - // Take all of the child nodes of the furthest block and - // append them to the element created in the last step. - $newElt2->adoptChildren( $furthestBlock ); - - // Append that new element to the furthest block. - $furthestBlock->appendChild( $newElt2 ); - - // Remove the formatting element from the list of active - // formatting elements, and insert the new element into the - // list of active formatting elements at the position of - // the aforementioned bookmark. - $afe->remove( $fmtElt ); - $afe->replace( $BOOKMARK, $newElt2 ); - - // Remove the formatting element from the stack of open - // elements, and insert the new element into the stack of - // open elements immediately below the position of the - // furthest block in that stack. - $this->removeElement( $fmtElt ); - $this->insertAfter( $furthestBlock, $newElt2 ); - } - - return true; - } - - /** - * Return the contents of the open elements stack as a string for - * debugging. 
- * @return string - */ - public function __toString() { - $r = []; - foreach ( $this->elements as $elt ) { - array_push( $r, $elt->localName ); - } - return implode( ' ', $r ); - } -} - -/** - * A pseudo-element used as a marker in the list of active formatting elements - * - * @ingroup Parser - * @since 1.27 - */ -class BalanceMarker { - public $nextAFE; - public $prevAFE; -} - -/** - * The list of active formatting elements, which is used to handle - * mis-nested formatting element tags in the HTML5 tree builder - * specification. - * - * @ingroup Parser - * @since 1.27 - * @see https://html.spec.whatwg.org/multipage/syntax.html#list-of-active-formatting-elements - */ -class BalanceActiveFormattingElements { - /** The last (most recent) element in the list */ - private $tail; - - /** The first (least recent) element in the list */ - private $head; - - /** - * An array of arrays representing the population of elements in each bucket - * according to the Noah's Ark clause. The outer array is stack-like, with each - * integer-indexed element representing a segment of the list, bounded by - * markers. The first element represents the segment of the list before the - * first marker. - * - * The inner arrays are indexed by "Noah key", which is a string which uniquely - * identifies each bucket according to the rules in the spec. The value in - * the inner array is the first (least recently inserted) element in the bucket, - * and subsequent members of the bucket can be found by iterating through the - * singly-linked list via $node->nextNoah. - * - * This is optimised for the most common case of inserting into a bucket - * with zero members, and deleting a bucket containing one member. In the - * worst case, iteration through the list is still O(1) in the document - * size, since each bucket can have at most 3 members. 
- */ - private $noahTableStack = [ [] ]; - - public function __destruct() { - $next = null; - for ( $node = $this->head; $node; $node = $next ) { - $next = $node->nextAFE; - $node->prevAFE = $node->nextAFE = $node->nextNoah = null; - } - $this->head = $this->tail = $this->noahTableStack = null; - } - - public function insertMarker() { - $elt = new BalanceMarker; - if ( $this->tail ) { - $this->tail->nextAFE = $elt; - $elt->prevAFE = $this->tail; - } else { - $this->head = $elt; - } - $this->tail = $elt; - $this->noahTableStack[] = []; - } - - /** - * Follow the steps required when the spec requires us to "push onto the - * list of active formatting elements". - * @param BalanceElement $elt - */ - public function push( BalanceElement $elt ) { - // Must not be in the list already - if ( $elt->prevAFE !== null || $this->head === $elt ) { - throw new ParameterAssertionException( '$elt', - 'Cannot insert a node into the AFE list twice' ); - } - - // "Noah's Ark clause" -- if there are already three copies of - // this element before we encounter a marker, then drop the last - // one. - $noahKey = $elt->getNoahKey(); - $table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ]; - if ( !isset( $table[$noahKey] ) ) { - $table[$noahKey] = $elt; - } else { - $count = 1; - $head = $tail = $table[$noahKey]; - while ( $tail->nextNoah ) { - $tail = $tail->nextNoah; - $count++; - } - if ( $count >= 3 ) { - $this->remove( $head ); - } - $tail->nextNoah = $elt; - } - // Add to the main AFE list - if ( $this->tail ) { - $this->tail->nextAFE = $elt; - $elt->prevAFE = $this->tail; - } else { - $this->head = $elt; - } - $this->tail = $elt; - } - - /** - * Follow the steps required when the spec asks us to "clear the list of - * active formatting elements up to the last marker". 
- */ - public function clearToMarker() { - // Iterate back through the list starting from the tail - $tail = $this->tail; - while ( $tail && !( $tail instanceof BalanceMarker ) ) { - // Unlink the element - $prev = $tail->prevAFE; - $tail->prevAFE = null; - if ( $prev ) { - $prev->nextAFE = null; - } - $tail->nextNoah = null; - $tail = $prev; - } - // If we finished on a marker, unlink it and pop it off the Noah table stack - if ( $tail ) { - $prev = $tail->prevAFE; - if ( $prev ) { - $prev->nextAFE = null; - } - $tail = $prev; - array_pop( $this->noahTableStack ); - } else { - // No marker: wipe the top-level Noah table (which is the only one) - $this->noahTableStack[0] = []; - } - // If we removed all the elements, clear the head pointer - if ( !$tail ) { - $this->head = null; - } - $this->tail = $tail; - } - - /** - * Find and return the last element with the specified tag between the - * end of the list and the last marker on the list. - * Used when parsing <a> "in body mode". - * @param string $tag - * @return null|Node - */ - public function findElementByTag( $tag ) { - $elt = $this->tail; - while ( $elt && !( $elt instanceof BalanceMarker ) ) { - if ( $elt->localName === $tag ) { - return $elt; - } - $elt = $elt->prevAFE; - } - return null; - } - - /** - * Determine whether an element is in the list of formatting elements. - * @param BalanceElement $elt - * @return bool - */ - public function isInList( BalanceElement $elt ) { - return $this->head === $elt || $elt->prevAFE; - } - - /** - * Find the element $elt in the list and remove it. - * Used when parsing <a> in body mode. 
- * - * @param BalanceElement $elt - */ - public function remove( BalanceElement $elt ) { - if ( $this->head !== $elt && !$elt->prevAFE ) { - throw new ParameterAssertionException( '$elt', - "Attempted to remove an element which is not in the AFE list" ); - } - // Update head and tail pointers - if ( $this->head === $elt ) { - $this->head = $elt->nextAFE; - } - if ( $this->tail === $elt ) { - $this->tail = $elt->prevAFE; - } - // Update previous element - if ( $elt->prevAFE ) { - $elt->prevAFE->nextAFE = $elt->nextAFE; - } - // Update next element - if ( $elt->nextAFE ) { - $elt->nextAFE->prevAFE = $elt->prevAFE; - } - // Clear pointers so that isInList() etc. will work - $elt->prevAFE = $elt->nextAFE = null; - // Update Noah list - $this->removeFromNoahList( $elt ); - } - - private function addToNoahList( BalanceElement $elt ) { - $noahKey = $elt->getNoahKey(); - $table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ]; - if ( !isset( $table[$noahKey] ) ) { - $table[$noahKey] = $elt; - } else { - $tail = $table[$noahKey]; - while ( $tail->nextNoah ) { - $tail = $tail->nextNoah; - } - $tail->nextNoah = $elt; - } - } - - private function removeFromNoahList( BalanceElement $elt ) { - $table =& $this->noahTableStack[ count( $this->noahTableStack ) - 1 ]; - $key = $elt->getNoahKey(); - $noahElt = $table[$key]; - if ( $noahElt === $elt ) { - if ( $noahElt->nextNoah ) { - $table[$key] = $noahElt->nextNoah; - $noahElt->nextNoah = null; - } else { - unset( $table[$key] ); - } - } else { - do { - $prevNoahElt = $noahElt; - $noahElt = $prevNoahElt->nextNoah; - if ( $noahElt === $elt ) { - // Found it, unlink - $prevNoahElt->nextNoah = $elt->nextNoah; - $elt->nextNoah = null; - break; - } - } while ( $noahElt ); - } - } - - /** - * Find element $a in the list and replace it with element $b - * - * @param BalanceElement $a - * @param BalanceElement $b - */ - public function replace( BalanceElement $a, BalanceElement $b ) { - if ( $this->head !== $a && !$a->prevAFE 
) { - throw new ParameterAssertionException( '$a', - "Attempted to replace an element which is not in the AFE list" ); - } - // Update head and tail pointers - if ( $this->head === $a ) { - $this->head = $b; - } - if ( $this->tail === $a ) { - $this->tail = $b; - } - // Update previous element - if ( $a->prevAFE ) { - $a->prevAFE->nextAFE = $b; - } - // Update next element - if ( $a->nextAFE ) { - $a->nextAFE->prevAFE = $b; - } - $b->prevAFE = $a->prevAFE; - $b->nextAFE = $a->nextAFE; - $a->nextAFE = $a->prevAFE = null; - // Update Noah list - $this->removeFromNoahList( $a ); - $this->addToNoahList( $b ); - } - - /** - * Find $a in the list and insert $b after it. - - * @param BalanceElement $a - * @param BalanceElement $b - */ - public function insertAfter( BalanceElement $a, BalanceElement $b ) { - if ( $this->head !== $a && !$a->prevAFE ) { - throw new ParameterAssertionException( '$a', - "Attempted to insert after an element which is not in the AFE list" ); - } - if ( $this->tail === $a ) { - $this->tail = $b; - } - if ( $a->nextAFE ) { - $a->nextAFE->prevAFE = $b; - } - $b->nextAFE = $a->nextAFE; - $b->prevAFE = $a; - $a->nextAFE = $b; - $this->addToNoahList( $b ); - } - - /** - * Reconstruct the active formatting elements. - * @param BalanceStack $stack The open elements stack - * @see https://html.spec.whatwg.org/multipage/syntax.html#reconstruct-the-active-formatting-elements - */ - public function reconstruct( $stack ) { - $entry = $this->tail; - // If there are no entries in the list of active formatting elements, - // then there is nothing to reconstruct - if ( !$entry ) { - return; - } - // If the last is a marker, do nothing. - if ( $entry instanceof BalanceMarker ) { - return; - } - // Or if it is an open element, do nothing. 
- if ( $stack->indexOf( $entry ) >= 0 ) { - return; - } - - // Loop backward through the list until we find a marker or an - // open element - $foundIt = false; - while ( $entry->prevAFE ) { - $entry = $entry->prevAFE; - if ( $entry instanceof BalanceMarker || $stack->indexOf( $entry ) >= 0 ) { - $foundIt = true; - break; - } - } - - // Now loop forward, starting from the element after the current one (or - // the first element if we didn't find a marker or open element), - // recreating formatting elements and pushing them back onto the list - // of open elements. - if ( $foundIt ) { - $entry = $entry->nextAFE; - } - do { - $newElement = $stack->insertHTMLElement( - $entry->localName, - $entry->attribs ); - $this->replace( $entry, $newElement ); - $entry = $newElement->nextAFE; - } while ( $entry ); - } - - /** - * Get a string representation of the AFE list, for debugging - */ - public function __toString() { - $prev = null; - $s = ''; - for ( $node = $this->head; $node; $prev = $node, $node = $node->nextAFE ) { - if ( $node instanceof BalanceMarker ) { - $s .= "MARKER\n"; - continue; - } - $s .= $node->localName . '#' . substr( md5( spl_object_hash( $node ) ), 0, 8 ); - if ( $node->nextNoah ) { - $s .= " (noah sibling: {$node->nextNoah->localName}#" . - substr( md5( spl_object_hash( $node->nextNoah ) ), 0, 8 ) . - ')'; - } - if ( $node->nextAFE && $node->nextAFE->prevAFE !== $node ) { - $s .= " (reverse link is wrong!)"; - } - $s .= "\n"; - } - if ( $prev !== $this->tail ) { - $s .= "(tail pointer is wrong!)\n"; - } - return $s; - } -} - -/** - * An implementation of the tree building portion of the HTML5 parsing - * spec. - * - * This is used to balance and tidy output so that the result can - * always be cleanly serialized/deserialized by an HTML5 parser. It - * does *not* guarantee "conforming" output -- the HTML5 spec contains - * a number of constraints which are not enforced by the HTML5 parsing - * process. 
But the result will be free of gross errors: misnested or - * unclosed tags, for example, and will be unchanged by spec-complient - * parsing followed by serialization. - * - * The tree building stage is structured as a state machine. - * When comparing the implementation to - * https://www.w3.org/TR/html5/syntax.html#tree-construction - * note that each state is implemented as a function with a - * name ending in `Mode` (because the HTML spec refers to them - * as insertion modes). The current insertion mode is held by - * the $parseMode property. - * - * The following simplifications have been made: - * - We handle body content only (ie, we start `in body`.) - * - The document is never in "quirks mode". - * - All occurrences of < and > have been entity escaped, so we - * can parse tags by simply splitting on those two characters. - * (This also simplifies the handling of < inside ", - "
\n\na
\n\nb", - true # use the tidy-compatible mode - ]; - - return $tests; - } -}