From ff4a3287db7d7681f10fb9c289cd9b84bed5eed6 Mon Sep 17 00:00:00 2001 From: Max Semenik Date: Tue, 12 Apr 2016 13:20:20 -0700 Subject: [PATCH] Switch to external HtmlFormatter wikimedia/html-formatter is already present in mediawiki/vendor as of 3954ca36ce3cbedc76c1763ad2694320c1327ce6. Bug: T125001 Change-Id: Ie98096e5e3d325cde583bc66b21b8b41f2bba8b2 --- composer.json | 1 + includes/HtmlFormatter.php | 349 +------------------ includes/api/ApiHelp.php | 2 + tests/phpunit/includes/HtmlFormatterTest.php | 141 -------- 4 files changed, 7 insertions(+), 486 deletions(-) delete mode 100644 tests/phpunit/includes/HtmlFormatterTest.php diff --git a/composer.json b/composer.json index 1378e96ce3..6b7898ddd1 100644 --- a/composer.json +++ b/composer.json @@ -34,6 +34,7 @@ "wikimedia/cdb": "1.3.0", "wikimedia/cldr-plural-rule-parser": "1.0.0", "wikimedia/composer-merge-plugin": "1.3.1", + "wikimedia/html-formatter": "1.0.1", "wikimedia/ip-set": "1.0.1", "wikimedia/php-session-serializer": "1.0.3", "wikimedia/relpath": "1.0.3", diff --git a/includes/HtmlFormatter.php b/includes/HtmlFormatter.php index 206f0f7309..9bae8b5fef 100644 --- a/includes/HtmlFormatter.php +++ b/includes/HtmlFormatter.php @@ -1,7 +1,7 @@ html = $html; - } - - /** - * Turns a chunk of HTML into a proper document - * @param string $html - * @return string - */ - public static function wrapHTML( $html ) { - return '' . $html . ''; - } - - /** - * Override this in descendant class to modify HTML after it has been converted from DOM tree - * @param string $html HTML to process - * @return string Processed HTML - */ - protected function onHtmlReady( $html ) { - return $html; - } - - /** - * @return DOMDocument DOM to manipulate - */ - public function getDoc() { - if ( !$this->doc ) { - // DOMDocument::loadHTML isn't very good with encodings, so - // convert input to ASCII by encoding everything above 128 as entities. - $html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' ); - - // Workaround for bug that caused spaces before references - // to disappear during processing: https://phabricator.wikimedia.org/T55086 - // TODO: Please replace with a better fix if one can be found. - $html = str_replace( ' <', ' <', $html ); - - libxml_use_internal_errors( true ); - $loader = libxml_disable_entity_loader(); - $this->doc = new DOMDocument(); - $this->doc->strictErrorChecking = false; - $this->doc->loadHTML( $html ); - libxml_disable_entity_loader( $loader ); - libxml_use_internal_errors( false ); - $this->doc->encoding = 'UTF-8'; - } - return $this->doc; - } - - /** - * Sets whether images/videos/sounds should be removed from output - * @param bool $flag - */ - public function setRemoveMedia( $flag = true ) { - $this->removeMedia = $flag; - } - - /** - * Adds one or more selector of content to remove. A subset of CSS selector - * syntax is supported: - * - * - * .class - * . - * # - * - * @param array|string $selectors Selector(s) of stuff to remove - */ - public function remove( $selectors ) { - $this->itemsToRemove = array_merge( $this->itemsToRemove, (array)$selectors ); - } - - /** - * Adds one or more element name to the list to flatten (remove tag, but not its content) - * Can accept undelimited regexes - * - * Note this interface may fail in surprising unexpected ways due to usage of regexes, - * so should not be relied on for HTML markup security measures. - * - * @param array|string $elements Name(s) of tag(s) to flatten - */ - public function flatten( $elements ) { - $this->elementsToFlatten = array_merge( $this->elementsToFlatten, (array)$elements ); - } - - /** - * Instructs the formatter to flatten all tags - */ - public function flattenAllTags() { - $this->flatten( '[?!]?[a-z0-9]+' ); - } - - /** - * Removes content we've chosen to remove. The text of the removed elements can be - * extracted with the getText method. - * @return array Array of removed DOMElements - */ - public function filterContent() { - $removals = $this->parseItemsToRemove(); - - // Bail out early if nothing to do - if ( array_reduce( $removals, - function ( $carry, $item ) { - return $carry && !$item; - }, - true - ) ) { - return []; - } - - $doc = $this->getDoc(); - - // Remove tags - - // You can't remove DOMNodes from a DOMNodeList as you're iterating - // over them in a foreach loop. It will seemingly leave the internal - // iterator on the foreach out of wack and results will be quite - // strange. Though, making a queue of items to remove seems to work. - $domElemsToRemove = []; - foreach ( $removals['TAG'] as $tagToRemove ) { - $tagToRemoveNodes = $doc->getElementsByTagName( $tagToRemove ); - foreach ( $tagToRemoveNodes as $tagToRemoveNode ) { - if ( $tagToRemoveNode ) { - $domElemsToRemove[] = $tagToRemoveNode; - } - } - } - $removed = $this->removeElements( $domElemsToRemove ); - - // Elements with named IDs - $domElemsToRemove = []; - foreach ( $removals['ID'] as $itemToRemove ) { - $itemToRemoveNode = $doc->getElementById( $itemToRemove ); - if ( $itemToRemoveNode ) { - $domElemsToRemove[] = $itemToRemoveNode; - } - } - $removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) ); - - // CSS Classes - $domElemsToRemove = []; - $xpath = new DOMXPath( $doc ); - foreach ( $removals['CLASS'] as $classToRemove ) { - $elements = $xpath->query( '//*[contains(@class, "' . $classToRemove . '")]' ); - - /** @var $element DOMElement */ - foreach ( $elements as $element ) { - $classes = $element->getAttribute( 'class' ); - if ( preg_match( "/\b$classToRemove\b/", $classes ) && $element->parentNode ) { - $domElemsToRemove[] = $element; - } - } - } - $removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) ); - - // Tags with CSS Classes - foreach ( $removals['TAG_CLASS'] as $classToRemove ) { - $parts = explode( '.', $classToRemove ); - - $elements = $xpath->query( - '//' . $parts[0] . '[@class="' . $parts[1] . '"]' - ); - $removed = array_merge( $removed, $this->removeElements( $elements ) ); - } - - return $removed; - } - - /** - * Removes a list of elelments from DOMDocument - * @param array|DOMNodeList $elements - * @return array Array of removed elements - */ - private function removeElements( $elements ) { - $list = $elements; - if ( $elements instanceof DOMNodeList ) { - $list = []; - foreach ( $elements as $element ) { - $list[] = $element; - } - } - /** @var $element DOMElement */ - foreach ( $list as $element ) { - if ( $element->parentNode ) { - $element->parentNode->removeChild( $element ); - } - } - return $list; - } - - /** - * libxml in its usual pointlessness converts many chars to entities - this function - * perfoms a reverse conversion - * @param string $html - * @return string - */ - private function fixLibXML( $html ) { - static $replacements; - if ( !$replacements ) { - // We don't include rules like '"' => '"' because entities had already been - // normalized by libxml. Using this function with input not sanitized by libxml is UNSAFE! - $replacements = new ReplacementArray( [ - '"' => '"', - '&' => '&', - '<' => '<', - '>' => '>', - ] ); - } - $html = $replacements->replace( $html ); - - // Just in case the conversion in getDoc() above used named - // entities that aren't known to html_entity_decode(). - $html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' ); - - return $html; - } - - /** - * Performs final transformations and returns resulting HTML. Note that if you want to call this - * both without an element and with an element you should call it without an element first. If you - * specify the $element in the method it'll change the underlying dom and you won't be able to get - * it back. - * - * @param DOMElement|string|null $element ID of element to get HTML from or - * false to get it from the whole tree - * @return string Processed HTML - */ - public function getText( $element = null ) { - - if ( $this->doc ) { - if ( $element !== null && !( $element instanceof DOMElement ) ) { - $element = $this->doc->getElementById( $element ); - } - if ( $element ) { - $body = $this->doc->getElementsByTagName( 'body' )->item( 0 ); - $nodesArray = []; - foreach ( $body->childNodes as $node ) { - $nodesArray[] = $node; - } - foreach ( $nodesArray as $nodeArray ) { - $body->removeChild( $nodeArray ); - } - $body->appendChild( $element ); - } - $html = $this->doc->saveHTML(); - - $html = $this->fixLibXML( $html ); - if ( wfIsWindows() ) { - // Cleanup for CRLF misprocessing of unknown origin on Windows. - // If this error continues in the future, please track it down in the - // XML code paths if possible and fix there. - $html = str_replace( ' ', '', $html ); - } - } else { - $html = $this->html; - } - // Remove stuff added by wrapHTML() - $html = preg_replace( '/|^.*?|<\/body>.*$/s', '', $html ); - $html = $this->onHtmlReady( $html ); - - if ( $this->elementsToFlatten ) { - $elements = implode( '|', $this->elementsToFlatten ); - $html = preg_replace( "#]*>#is", '', $html ); - } - - return $html; - } - - /** - * Helper function for parseItemsToRemove(). This function extracts the selector type - * and the raw name of a selector from a CSS-style selector string and assigns those - * values to parameters passed by reference. For example, if given '#toc' as the - * $selector parameter, it will assign 'ID' as the $type and 'toc' as the $rawName. - * @param string $selector CSS selector to parse - * @param string $type The type of selector (ID, CLASS, TAG_CLASS, or TAG) - * @param string $rawName The raw name of the selector - * @return bool Whether the selector was successfully recognised - * @throws MWException - */ - protected function parseSelector( $selector, &$type, &$rawName ) { - if ( strpos( $selector, '.' ) === 0 ) { - $type = 'CLASS'; - $rawName = substr( $selector, 1 ); - } elseif ( strpos( $selector, '#' ) === 0 ) { - $type = 'ID'; - $rawName = substr( $selector, 1 ); - } elseif ( strpos( $selector, '.' ) !== 0 && strpos( $selector, '.' ) !== false ) { - $type = 'TAG_CLASS'; - $rawName = $selector; - } elseif ( strpos( $selector, '[' ) === false && strpos( $selector, ']' ) === false ) { - $type = 'TAG'; - $rawName = $selector; - } else { - throw new MWException( __METHOD__ . "(): unrecognized selector '$selector'" ); - } - - return true; - } - - /** - * Transforms CSS-style selectors into an internal representation suitable for - * processing by filterContent() - * @return array - */ - protected function parseItemsToRemove() { - $removals = [ - 'ID' => [], - 'TAG' => [], - 'CLASS' => [], - 'TAG_CLASS' => [], - ]; - - foreach ( $this->itemsToRemove as $itemToRemove ) { - $type = ''; - $rawName = ''; - if ( $this->parseSelector( $itemToRemove, $type, $rawName ) ) { - $removals[$type][] = $rawName; - } - } - - if ( $this->removeMedia ) { - $removals['TAG'][] = 'img'; - $removals['TAG'][] = 'audio'; - $removals['TAG'][] = 'video'; - } - - return $removals; - } +class HtmlFormatter extends HtmlFormatter\HtmlFormatter { } diff --git a/includes/api/ApiHelp.php b/includes/api/ApiHelp.php index f2d6329ee5..f7539ce2d1 100644 --- a/includes/api/ApiHelp.php +++ b/includes/api/ApiHelp.php @@ -24,6 +24,8 @@ * @file */ +use HtmlFormatter\HtmlFormatter; + /** * Class to output help for an API module * diff --git a/tests/phpunit/includes/HtmlFormatterTest.php b/tests/phpunit/includes/HtmlFormatterTest.php deleted file mode 100644 index ab5219c4c8..0000000000 --- a/tests/phpunit/includes/HtmlFormatterTest.php +++ /dev/null @@ -1,141 +0,0 @@ -setMwGlobals( 'wgTidyInternal', $tidySupport->isInternal() ); - } - - /** - * @dataProvider getHtmlData - * - * @param string $input - * @param string $expectedText - * @param array $expectedRemoved - * @param callable|bool $callback - */ - public function testTransform( $input, $expectedText, - $expectedRemoved = [], $callback = false - ) { - $input = self::normalize( $input ); - $formatter = new HtmlFormatter( HtmlFormatter::wrapHTML( $input ) ); - if ( $callback ) { - $callback( $formatter ); - } - $removedElements = $formatter->filterContent(); - $html = $formatter->getText(); - $removed = []; - foreach ( $removedElements as $removedElement ) { - $removed[] = self::normalize( $formatter->getText( $removedElement ) ); - } - $expectedRemoved = array_map( 'self::normalize', $expectedRemoved ); - - $this->assertValidHtmlSnippet( $html ); - $this->assertEquals( self::normalize( $expectedText ), self::normalize( $html ) ); - $this->assertEquals( asort( $expectedRemoved ), asort( $removed ) ); - } - - private static function normalize( $s ) { - return str_replace( "\n", '', - str_replace( "\r", '', $s ) // "yay" to Windows! - ); - } - - public function getHtmlData() { - $removeImages = function ( HtmlFormatter $f ) { - $f->setRemoveMedia(); - }; - $removeTags = function ( HtmlFormatter $f ) { - $f->remove( [ 'table', '.foo', '#bar', 'div.baz' ] ); - }; - $flattenSomeStuff = function ( HtmlFormatter $f ) { - $f->flatten( [ 's', 'div' ] ); - }; - $flattenEverything = function ( HtmlFormatter $f ) { - $f->flattenAllTags(); - }; - return [ - // remove images if asked - [ - ' Blah

Blah

', - '', - [ ' Blah

Blah

' ], - $removeImages, - ], - // basic tag removal - [ - // @codingStandardsIgnoreStart Ignore long line warnings. - '

foo

foo

foo

bar -foobar

test

-baz', - // @codingStandardsIgnoreEnd - '

test

-baz', - [ - '

foo

', - '

foo

', - '

foo

', - 'bar', - 'foobar', - '

', - ], - $removeTags, - ], - // don't flatten tags that start like chosen ones - [ - '

~~foo~~ bar

', - 'foo bar', - [], - $flattenSomeStuff, - ], - // total flattening - [ - '

bar²

', - 'bar2', - [], - $flattenEverything, - ], - // UTF-8 preservation and security - [ - '<Ð¢ÐµÑÑ!> &<&&&&', - '<Ð¢ÐµÑÑ!> &<&&&&', - [], - $removeTags, // Have some rules to trigger a DOM parse - ], - // https://phabricator.wikimedia.org/T55086 - [ - 'Foo^[1]' - . ' Bar', - 'Foo^[1]' - . ' Bar', - ], - ]; - } - - public function testQuickProcessing() { - $f = new MockHtmlFormatter( 'foo' ); - $f->filterContent(); - $this->assertFalse( $f->hasDoc, 'HtmlFormatter should not needlessly parse HTML' ); - } -} - -class MockHtmlFormatter extends HtmlFormatter { - public $hasDoc = false; - - public function getDoc() { - $this->hasDoc = true; - return parent::getDoc(); - } -} -- 2.20.1