From 19627682ac77b4c1a0e2a7041dcae6fc7570a8ec Mon Sep 17 00:00:00 2001 From: Max Semenik Date: Mon, 16 Sep 2013 09:54:32 -0700 Subject: [PATCH] Move HtmlFormatter from MobileFrontend This class is needed by CirrusSearch, another application will be API prop=extracts which doesn't really belong to MobileFrontend and will also be integrated into core soon. Change-Id: Ic276e1604c5718e8568e120ddfb9a8fc13a682fc --- includes/AutoLoader.php | 1 + includes/HtmlFormatter.php | 333 +++++++++++++++++++ tests/phpunit/includes/HtmlFormatterTest.php | 81 +++++ 3 files changed, 415 insertions(+) create mode 100644 includes/HtmlFormatter.php create mode 100644 tests/phpunit/includes/HtmlFormatterTest.php diff --git a/includes/AutoLoader.php b/includes/AutoLoader.php index 604add33d8..48e744e81f 100644 --- a/includes/AutoLoader.php +++ b/includes/AutoLoader.php @@ -111,6 +111,7 @@ $wgAutoloadLocalClasses = array( 'HistoryBlobStub' => 'includes/HistoryBlob.php', 'Hooks' => 'includes/Hooks.php', 'Html' => 'includes/Html.php', + 'HtmlFormatter' => 'includes/HtmlFormatter.php', 'HTMLApiField' => 'includes/HTMLForm.php', 'HTMLButtonField' => 'includes/HTMLForm.php', 'HTMLCheckField' => 'includes/HTMLForm.php', diff --git a/includes/HtmlFormatter.php b/includes/HtmlFormatter.php new file mode 100644 index 0000000000..5f97140ee4 --- /dev/null +++ b/includes/HtmlFormatter.php @@ -0,0 +1,333 @@ +html = $html; + } + + /** + * Turns a chunk of HTML into a proper document + * @param string $html + * @return string + */ + public static function wrapHTML( $html ) { + return '' . $html . 
''; + } + + /** + * Override this in descendant class to modify HTML after it has been converted from DOM tree + * @param string $html: HTML to process + * @return string: Processed HTML + */ + protected function onHtmlReady( $html ) { + return $html; + } + + /** + * @return DOMDocument: DOM to manipulate + */ + public function getDoc() { + if ( !$this->doc ) { + $html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' ); + + // https://bugzilla.wikimedia.org/show_bug.cgi?id=53086 + $html = str_replace( ' <', ' <', $html ); + + libxml_use_internal_errors( true ); + $this->doc = new DOMDocument(); + $this->doc->strictErrorChecking = false; + $this->doc->loadHTML( $html ); + libxml_use_internal_errors( false ); + $this->doc->encoding = 'UTF-8'; + } + return $this->doc; + } + + /** + * Sets whether images/videos/sounds should be removed from output + * @param bool $flag + */ + public function setRemoveMedia( $flag = true ) { + $this->removeMedia = $flag; + } + + /** + * Adds one or more selector of content to remove + * @param Array|string $selectors: Selector(s) of stuff to remove + */ + public function remove( $selectors ) { + $this->itemsToRemove = array_merge( $this->itemsToRemove, (array)$selectors ); + } + + /** + * Adds one or more element name to the list to flatten (remove tag, but not its content) + * Can accept undelimited regexes + * @param Array|string $elements: Name(s) of tag(s) to flatten + */ + public function flatten( $elements ) { + $this->elementsToFlatten = array_merge( $this->elementsToFlatten, (array)$elements ); + } + + /** + * Instructs the formatter to flatten all tags + */ + public function flattenAllTags() { + $this->flatten( '[?!]?[a-z0-9]+' ); + } + + /** + * Removes content we've chosen to remove + */ + public function filterContent() { + wfProfileIn( __METHOD__ ); + $removals = $this->parseItemsToRemove(); + + if ( !$removals ) { + return; + } + + $doc = $this->getDoc(); + + // Remove tags + + // You can't remove DOMNodes from 
a DOMNodeList as you're iterating + // over them in a foreach loop. It will seemingly leave the internal + // iterator on the foreach out of whack and results will be quite + // strange. Though, making a queue of items to remove seems to work. + $domElemsToRemove = array(); + foreach ( $removals['TAG'] as $tagToRemove ) { + $tagToRemoveNodes = $doc->getElementsByTagName( $tagToRemove ); + foreach ( $tagToRemoveNodes as $tagToRemoveNode ) { + if ( $tagToRemoveNode ) { + $domElemsToRemove[] = $tagToRemoveNode; + } + } + } + + $this->removeElements( $domElemsToRemove ); + + // Elements with named IDs + $domElemsToRemove = array(); + foreach ( $removals['ID'] as $itemToRemove ) { + $itemToRemoveNode = $doc->getElementById( $itemToRemove ); + if ( $itemToRemoveNode ) { + $domElemsToRemove[] = $itemToRemoveNode; + } + } + $this->removeElements( $domElemsToRemove ); + + // CSS Classes + $domElemsToRemove = array(); + $xpath = new DOMXpath( $doc ); + foreach ( $removals['CLASS'] as $classToRemove ) { + $elements = $xpath->query( '//*[contains(@class, "' . $classToRemove . '")]' ); + + /** @var $element DOMElement */ + foreach ( $elements as $element ) { + $classes = $element->getAttribute( 'class' ); + if ( preg_match( "/\b$classToRemove\b/", $classes ) && $element->parentNode ) { + $domElemsToRemove[] = $element; + } + } + } + $this->removeElements( $domElemsToRemove ); + + // Tags with CSS Classes + foreach ( $removals['TAG_CLASS'] as $classToRemove ) { + $parts = explode( '.', $classToRemove ); + + $elements = $xpath->query( + '//' . $parts[0] . '[@class="' . $parts[1] . 
'"]' + ); + + $this->removeElements( $elements ); + } + + wfProfileOut( __METHOD__ ); + } + + /** + * Removes a list of elements from DOMDocument + * @param array|DOMNodeList $elements + */ + private function removeElements( $elements ) { + $list = $elements; + if ( $elements instanceof DOMNodeList ) { + $list = array(); + foreach ( $elements as $element ) { + $list[] = $element; + } + } + /** @var $element DOMElement */ + foreach ( $list as $element ) { + if ( $element->parentNode ) { + $element->parentNode->removeChild( $element ); + } + } + } + + /** + * libxml in its usual pointlessness converts many chars to entities - this function + * performs a reverse conversion + * @param string $html + * @return string + */ + private function fixLibXML( $html ) { + wfProfileIn( __METHOD__ ); + static $replacements; + if ( ! $replacements ) { + // We don't include rules like '"' => '&quot;' because entities had already been + // normalized by libxml. Using this function with input not sanitized by libxml is UNSAFE! 
+ $replacements = new ReplacementArray( array( + '"' => '&quot;', + '&' => '&amp;', + '<' => '&lt;', + '>' => '&gt;', + ) ); + } + $html = $replacements->replace( $html ); + $html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' ); + wfProfileOut( __METHOD__ ); + return $html; + } + + /** + * Performs final transformations and returns resulting HTML + * + * @param DOMElement|string|null $element: Element, or ID of the element, to get HTML from; null to get it from the whole tree + * @return string: Processed HTML + */ + public function getText( $element = null ) { + wfProfileIn( __METHOD__ ); + + if ( $this->doc ) { + if ( $element !== null && !( $element instanceof DOMElement ) ) { + $element = $this->doc->getElementById( $element ); + } + if ( $element ) { + $body = $this->doc->getElementsByTagName( 'body' )->item( 0 ); + $nodesArray = array(); + foreach ( $body->childNodes as $node ) { + $nodesArray[] = $node; + } + foreach ( $nodesArray as $nodeArray ) { + $body->removeChild( $nodeArray ); + } + $body->appendChild( $element ); + } + $html = $this->doc->saveHTML(); + $html = $this->fixLibXml( $html ); + } else { + $html = $this->html; + } + if ( wfIsWindows() ) { + $html = str_replace( ' ', '', $html ); + } + $html = preg_replace( '/|^.*?|<\/body>.*$/s', '', $html ); + $html = $this->onHtmlReady( $html ); + + if ( $this->elementsToFlatten ) { + $elements = implode( '|', $this->elementsToFlatten ); + $html = preg_replace( "#]*>#is", '', $html ); + } + + wfProfileOut( __METHOD__ ); + return $html; + } + + /** + * @param $selector: CSS selector to parse + * @param $type + * @param $rawName + * @return bool: Whether the selector was successfully recognised + */ + protected function parseSelector( $selector, &$type, &$rawName ) { + if ( strpos( $selector, '.' ) === 0 ) { + $type = 'CLASS'; + $rawName = substr( $selector, 1 ); + } elseif ( strpos( $selector, '#' ) === 0 ) { + $type = 'ID'; + $rawName = substr( $selector, 1 ); + } elseif ( strpos( $selector, '.' 
) !== 0 && + strpos( $selector, '.' ) !== false ) + { + $type = 'TAG_CLASS'; + $rawName = $selector; + } elseif ( strpos( $selector, '[' ) === false + && strpos( $selector, ']' ) === false ) + { + $type = 'TAG'; + $rawName = $selector; + } else { + throw new MWException( __METHOD__ . "(): unrecognized selector '$selector'" ); + } + + return true; + } + + /** + * Transforms CSS selectors into an internal representation suitable for processing + * @return array + */ + protected function parseItemsToRemove() { + wfProfileIn( __METHOD__ ); + $removals = array( + 'ID' => array(), + 'TAG' => array(), + 'CLASS' => array(), + 'TAG_CLASS' => array(), + ); + + foreach ( $this->itemsToRemove as $itemToRemove ) { + $type = ''; + $rawName = ''; + if ( $this->parseSelector( $itemToRemove, $type, $rawName ) ) { + $removals[$type][] = $rawName; + } + } + + if ( $this->removeMedia ) { + $removals['TAG'][] = 'img'; + $removals['TAG'][] = 'audio'; + $removals['TAG'][] = 'video'; + } + + wfProfileOut( __METHOD__ ); + return $removals; + } +} diff --git a/tests/phpunit/includes/HtmlFormatterTest.php b/tests/phpunit/includes/HtmlFormatterTest.php new file mode 100644 index 0000000000..a37df74fc1 --- /dev/null +++ b/tests/phpunit/includes/HtmlFormatterTest.php @@ -0,0 +1,81 @@ +filterContent(); + $html = $formatter->getText(); + $this->assertEquals( self::normalize( $expected ), self::normalize( $html ) ); + } + + private static function normalize( $s ) { + return str_replace( "\n", '', + str_replace( "\r", '', $s ) // "yay" to Windows! 
+ ); + } + + public function getHtmlData() { + $removeImages = function( HtmlFormatter $f ) { + $f->setRemoveMedia(); + }; + $removeTags = function( HtmlFormatter $f ) { + $f->remove( array( 'table', '.foo', '#bar', 'div.baz' ) ); + }; + $flattenSomeStuff = function( HtmlFormatter $f ) { + $f->flatten( array( 's', 'div' ) ); + }; + $flattenEverything = function( HtmlFormatter $f ) { + $f->flattenAllTags(); + }; + return array( + // remove images if asked + array( + 'Blah', + '', + $removeImages, + ), + // basic tag removal + array( + '
foo
foo
foo
bar +foobar
test
+baz', + + '
test
+baz', + $removeTags, + ), + // don't flatten tags that start like chosen ones + array( + '
foo bar
', + 'foo bar', + $flattenSomeStuff, + ), + // total flattening + array( + '
bar2
', + 'bar2', + $flattenEverything, + ), + // UTF-8 preservation and security + array( + '<Тест!> &<&&&&', + '<Тест!> &<&&&&', + ), + // https://bugzilla.wikimedia.org/show_bug.cgi?id=53086 + array( + 'Foo[1] Bar', + 'Foo[1] Bar', + ), + ); + } +} -- 2.20.1