<?php
/**
- * Performs transformations of HTML by wrapping around libxml2 and working
- * around its countless bugs.
+ * Stub for extensions that haven't yet switched to the Composer-based version of this class.
+ * @todo Remove in 1.28
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* http://www.gnu.org/copyleft/gpl.html
*
* @file
+ * @deprecated since 1.27, use HtmlFormatter\HtmlFormatter
*/
-class HtmlFormatter {
- /**
- * @var DOMDocument
- */
- private $doc;
-
- private $html;
- private $itemsToRemove = [];
- private $elementsToFlatten = [];
- protected $removeMedia = false;
-
- /**
- * Constructor
- *
- * @param string $html Text to process
- */
- public function __construct( $html ) {
- $this->html = $html;
- }
-
- /**
- * Turns a chunk of HTML into a proper document
- * @param string $html
- * @return string
- */
- public static function wrapHTML( $html ) {
- return '<!doctype html><html><head></head><body>' . $html . '</body></html>';
- }
-
- /**
- * Override this in descendant class to modify HTML after it has been converted from DOM tree
- * @param string $html HTML to process
- * @return string Processed HTML
- */
- protected function onHtmlReady( $html ) {
- return $html;
- }
-
- /**
- * @return DOMDocument DOM to manipulate
- */
- public function getDoc() {
- if ( !$this->doc ) {
- // DOMDocument::loadHTML isn't very good with encodings, so
- // convert input to ASCII by encoding everything above 128 as entities.
- $html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' );
-
- // Workaround for bug that caused spaces before references
- // to disappear during processing: https://phabricator.wikimedia.org/T55086
- // TODO: Please replace with a better fix if one can be found.
- $html = str_replace( ' <', ' <', $html );
-
- libxml_use_internal_errors( true );
- $loader = libxml_disable_entity_loader();
- $this->doc = new DOMDocument();
- $this->doc->strictErrorChecking = false;
- $this->doc->loadHTML( $html );
- libxml_disable_entity_loader( $loader );
- libxml_use_internal_errors( false );
- $this->doc->encoding = 'UTF-8';
- }
- return $this->doc;
- }
-
- /**
- * Sets whether images/videos/sounds should be removed from output
- * @param bool $flag
- */
- public function setRemoveMedia( $flag = true ) {
- $this->removeMedia = $flag;
- }
-
- /**
- * Adds one or more selector of content to remove. A subset of CSS selector
- * syntax is supported:
- *
- * <tag>
- * <tag>.class
- * .<class>
- * #<id>
- *
- * @param array|string $selectors Selector(s) of stuff to remove
- */
- public function remove( $selectors ) {
- $this->itemsToRemove = array_merge( $this->itemsToRemove, (array)$selectors );
- }
-
- /**
- * Adds one or more element name to the list to flatten (remove tag, but not its content)
- * Can accept undelimited regexes
- *
- * Note this interface may fail in surprising unexpected ways due to usage of regexes,
- * so should not be relied on for HTML markup security measures.
- *
- * @param array|string $elements Name(s) of tag(s) to flatten
- */
- public function flatten( $elements ) {
- $this->elementsToFlatten = array_merge( $this->elementsToFlatten, (array)$elements );
- }
-
- /**
- * Instructs the formatter to flatten all tags
- */
- public function flattenAllTags() {
- $this->flatten( '[?!]?[a-z0-9]+' );
- }
-
- /**
- * Removes content we've chosen to remove. The text of the removed elements can be
- * extracted with the getText method.
- * @return array Array of removed DOMElements
- */
- public function filterContent() {
- $removals = $this->parseItemsToRemove();
-
- // Bail out early if nothing to do
- if ( array_reduce( $removals,
- function ( $carry, $item ) {
- return $carry && !$item;
- },
- true
- ) ) {
- return [];
- }
-
- $doc = $this->getDoc();
-
- // Remove tags
-
- // You can't remove DOMNodes from a DOMNodeList as you're iterating
- // over them in a foreach loop. It will seemingly leave the internal
- // iterator on the foreach out of wack and results will be quite
- // strange. Though, making a queue of items to remove seems to work.
- $domElemsToRemove = [];
- foreach ( $removals['TAG'] as $tagToRemove ) {
- $tagToRemoveNodes = $doc->getElementsByTagName( $tagToRemove );
- foreach ( $tagToRemoveNodes as $tagToRemoveNode ) {
- if ( $tagToRemoveNode ) {
- $domElemsToRemove[] = $tagToRemoveNode;
- }
- }
- }
- $removed = $this->removeElements( $domElemsToRemove );
-
- // Elements with named IDs
- $domElemsToRemove = [];
- foreach ( $removals['ID'] as $itemToRemove ) {
- $itemToRemoveNode = $doc->getElementById( $itemToRemove );
- if ( $itemToRemoveNode ) {
- $domElemsToRemove[] = $itemToRemoveNode;
- }
- }
- $removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );
-
- // CSS Classes
- $domElemsToRemove = [];
- $xpath = new DOMXPath( $doc );
- foreach ( $removals['CLASS'] as $classToRemove ) {
- $elements = $xpath->query( '//*[contains(@class, "' . $classToRemove . '")]' );
-
- /** @var $element DOMElement */
- foreach ( $elements as $element ) {
- $classes = $element->getAttribute( 'class' );
- if ( preg_match( "/\b$classToRemove\b/", $classes ) && $element->parentNode ) {
- $domElemsToRemove[] = $element;
- }
- }
- }
- $removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );
-
- // Tags with CSS Classes
- foreach ( $removals['TAG_CLASS'] as $classToRemove ) {
- $parts = explode( '.', $classToRemove );
-
- $elements = $xpath->query(
- '//' . $parts[0] . '[@class="' . $parts[1] . '"]'
- );
- $removed = array_merge( $removed, $this->removeElements( $elements ) );
- }
-
- return $removed;
- }
-
- /**
- * Removes a list of elelments from DOMDocument
- * @param array|DOMNodeList $elements
- * @return array Array of removed elements
- */
- private function removeElements( $elements ) {
- $list = $elements;
- if ( $elements instanceof DOMNodeList ) {
- $list = [];
- foreach ( $elements as $element ) {
- $list[] = $element;
- }
- }
- /** @var $element DOMElement */
- foreach ( $list as $element ) {
- if ( $element->parentNode ) {
- $element->parentNode->removeChild( $element );
- }
- }
- return $list;
- }
-
- /**
- * libxml in its usual pointlessness converts many chars to entities - this function
- * perfoms a reverse conversion
- * @param string $html
- * @return string
- */
- private function fixLibXML( $html ) {
- static $replacements;
- if ( !$replacements ) {
- // We don't include rules like '"' => '&quot;' because entities had already been
- // normalized by libxml. Using this function with input not sanitized by libxml is UNSAFE!
- $replacements = new ReplacementArray( [
- '"' => '&quot;',
- '&' => '&amp;',
- '<' => '&lt;',
- '>' => '&gt;',
- ] );
- }
- $html = $replacements->replace( $html );
-
- // Just in case the conversion in getDoc() above used named
- // entities that aren't known to html_entity_decode().
- $html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' );
-
- return $html;
- }
-
- /**
- * Performs final transformations and returns resulting HTML. Note that if you want to call this
- * both without an element and with an element you should call it without an element first. If you
- * specify the $element in the method it'll change the underlying dom and you won't be able to get
- * it back.
- *
- * @param DOMElement|string|null $element ID of element to get HTML from or
- * false to get it from the whole tree
- * @return string Processed HTML
- */
- public function getText( $element = null ) {
-
- if ( $this->doc ) {
- if ( $element !== null && !( $element instanceof DOMElement ) ) {
- $element = $this->doc->getElementById( $element );
- }
- if ( $element ) {
- $body = $this->doc->getElementsByTagName( 'body' )->item( 0 );
- $nodesArray = [];
- foreach ( $body->childNodes as $node ) {
- $nodesArray[] = $node;
- }
- foreach ( $nodesArray as $nodeArray ) {
- $body->removeChild( $nodeArray );
- }
- $body->appendChild( $element );
- }
- $html = $this->doc->saveHTML();
-
- $html = $this->fixLibXML( $html );
- if ( wfIsWindows() ) {
- // Cleanup for CRLF misprocessing of unknown origin on Windows.
- // If this error continues in the future, please track it down in the
- // XML code paths if possible and fix there.
- $html = str_replace( ' ', '', $html );
- }
- } else {
- $html = $this->html;
- }
- // Remove stuff added by wrapHTML()
- $html = preg_replace( '/<!--.*?-->|^.*?<body>|<\/body>.*$/s', '', $html );
- $html = $this->onHtmlReady( $html );
-
- if ( $this->elementsToFlatten ) {
- $elements = implode( '|', $this->elementsToFlatten );
- $html = preg_replace( "#</?($elements)\\b[^>]*>#is", '', $html );
- }
-
- return $html;
- }
-
- /**
- * Helper function for parseItemsToRemove(). This function extracts the selector type
- * and the raw name of a selector from a CSS-style selector string and assigns those
- * values to parameters passed by reference. For example, if given '#toc' as the
- * $selector parameter, it will assign 'ID' as the $type and 'toc' as the $rawName.
- * @param string $selector CSS selector to parse
- * @param string $type The type of selector (ID, CLASS, TAG_CLASS, or TAG)
- * @param string $rawName The raw name of the selector
- * @return bool Whether the selector was successfully recognised
- * @throws MWException
- */
- protected function parseSelector( $selector, &$type, &$rawName ) {
- if ( strpos( $selector, '.' ) === 0 ) {
- $type = 'CLASS';
- $rawName = substr( $selector, 1 );
- } elseif ( strpos( $selector, '#' ) === 0 ) {
- $type = 'ID';
- $rawName = substr( $selector, 1 );
- } elseif ( strpos( $selector, '.' ) !== 0 && strpos( $selector, '.' ) !== false ) {
- $type = 'TAG_CLASS';
- $rawName = $selector;
- } elseif ( strpos( $selector, '[' ) === false && strpos( $selector, ']' ) === false ) {
- $type = 'TAG';
- $rawName = $selector;
- } else {
- throw new MWException( __METHOD__ . "(): unrecognized selector '$selector'" );
- }
-
- return true;
- }
-
- /**
- * Transforms CSS-style selectors into an internal representation suitable for
- * processing by filterContent()
- * @return array
- */
- protected function parseItemsToRemove() {
- $removals = [
- 'ID' => [],
- 'TAG' => [],
- 'CLASS' => [],
- 'TAG_CLASS' => [],
- ];
-
- foreach ( $this->itemsToRemove as $itemToRemove ) {
- $type = '';
- $rawName = '';
- if ( $this->parseSelector( $itemToRemove, $type, $rawName ) ) {
- $removals[$type][] = $rawName;
- }
- }
-
- if ( $this->removeMedia ) {
- $removals['TAG'][] = 'img';
- $removals['TAG'][] = 'audio';
- $removals['TAG'][] = 'video';
- }
-
- return $removals;
- }
+/**
+ * Empty subclass that keeps the global HtmlFormatter class name available;
+ * all behaviour is inherited from HtmlFormatter\HtmlFormatter.
+ *
+ * @deprecated since 1.27, use HtmlFormatter\HtmlFormatter
+ */
+class HtmlFormatter extends HtmlFormatter\HtmlFormatter {
}
+++ /dev/null
-<?php
-
-/**
- * @group HtmlFormatter
- */
-class HtmlFormatterTest extends MediaWikiTestCase {
-
- /**
- * Use TidySupport to check whether we should use $wgTidyInternal.
- *
- * The Tidy extension in HHVM does not support error text return, so it is
- * nominally usable, but does not pass tests which require error text from
- * Tidy.
- */
- protected function setUp() {
- parent::setUp();
- $tidySupport = new TidySupport();
- $this->setMwGlobals( 'wgTidyInternal', $tidySupport->isInternal() );
- }
-
- /**
- * @dataProvider getHtmlData
- *
- * @param string $input
- * @param string $expectedText
- * @param array $expectedRemoved
- * @param callable|bool $callback
- */
- public function testTransform( $input, $expectedText,
- $expectedRemoved = [], $callback = false
- ) {
- $input = self::normalize( $input );
- $formatter = new HtmlFormatter( HtmlFormatter::wrapHTML( $input ) );
- if ( $callback ) {
- $callback( $formatter );
- }
- $removedElements = $formatter->filterContent();
- $html = $formatter->getText();
- $removed = [];
- foreach ( $removedElements as $removedElement ) {
- $removed[] = self::normalize( $formatter->getText( $removedElement ) );
- }
- $expectedRemoved = array_map( 'self::normalize', $expectedRemoved );
-
- $this->assertValidHtmlSnippet( $html );
- $this->assertEquals( self::normalize( $expectedText ), self::normalize( $html ) );
- $this->assertEquals( asort( $expectedRemoved ), asort( $removed ) );
- }
-
- private static function normalize( $s ) {
- return str_replace( "\n", '',
- str_replace( "\r", '', $s ) // "yay" to Windows!
- );
- }
-
- public function getHtmlData() {
- $removeImages = function ( HtmlFormatter $f ) {
- $f->setRemoveMedia();
- };
- $removeTags = function ( HtmlFormatter $f ) {
- $f->remove( [ 'table', '.foo', '#bar', 'div.baz' ] );
- };
- $flattenSomeStuff = function ( HtmlFormatter $f ) {
- $f->flatten( [ 's', 'div' ] );
- };
- $flattenEverything = function ( HtmlFormatter $f ) {
- $f->flattenAllTags();
- };
- return [
- // remove images if asked
- [
- '<img src="/foo/bar.jpg" alt="Blah"/>',
- '',
- [ '<img src="/foo/bar.jpg" alt="Blah">' ],
- $removeImages,
- ],
- // basic tag removal
- [
- // @codingStandardsIgnoreStart Ignore long line warnings.
- '<table><tr><td>foo</td></tr></table><div class="foo">foo</div><div class="foo quux">foo</div><span id="bar">bar</span>
-<strong class="foo" id="bar">foobar</strong><div class="notfoo">test</div><div class="baz"/>
-<span class="baz">baz</span>',
- // @codingStandardsIgnoreEnd
- '<div class="notfoo">test</div>
-<span class="baz">baz</span>',
- [
- '<table><tr><td>foo</td></tr></table>',
- '<div class="foo">foo</div>',
- '<div class="foo quux">foo</div>',
- '<span id="bar">bar</span>',
- '<strong class="foo" id="bar">foobar</strong>',
- '<div class="baz"/>',
- ],
- $removeTags,
- ],
- // don't flatten tags that start like chosen ones
- [
- '<div><s>foo</s> <span>bar</span></div>',
- 'foo <span>bar</span>',
- [],
- $flattenSomeStuff,
- ],
- // total flattening
- [
- '<div style="foo">bar<sup>2</sup></div>',
- 'bar2',
- [],
- $flattenEverything,
- ],
- // UTF-8 preservation and security
- [
- '<span title="" \' &"><Тест!></span> &<&&&&',
- '<span title="" \' &"><Тест!></span> &<&&&&',
- [],
- $removeTags, // Have some rules to trigger a DOM parse
- ],
- // https://phabricator.wikimedia.org/T55086
- [
- 'Foo<sup id="cite_ref-1" class="reference"><a href="#cite_note-1">[1]</a></sup>'
- . ' <a href="/wiki/Bar" title="Bar" class="mw-redirect">Bar</a>',
- 'Foo<sup id="cite_ref-1" class="reference"><a href="#cite_note-1">[1]</a></sup>'
- . ' <a href="/wiki/Bar" title="Bar" class="mw-redirect">Bar</a>',
- ],
- ];
- }
-
- public function testQuickProcessing() {
- $f = new MockHtmlFormatter( 'foo' );
- $f->filterContent();
- $this->assertFalse( $f->hasDoc, 'HtmlFormatter should not needlessly parse HTML' );
- }
-}
-
-class MockHtmlFormatter extends HtmlFormatter {
- public $hasDoc = false;
-
- public function getDoc() {
- $this->hasDoc = true;
- return parent::getDoc();
- }
-}