Switch to external HtmlFormatter

author Max Semenik <maxsem.wiki@gmail.com>

Tue, 12 Apr 2016 20:20:20 +0000 (13:20 -0700)

committer BryanDavis <bdavis@wikimedia.org>

Wed, 13 Apr 2016 02:37:59 +0000 (02:37 +0000)
author Max Semenik <maxsem.wiki@gmail.com>
Tue, 12 Apr 2016 20:20:20 +0000 (13:20 -0700)
committer BryanDavis <bdavis@wikimedia.org>
Wed, 13 Apr 2016 02:37:59 +0000 (02:37 +0000)
diff --git a/composer.json b/composer.json

index 1378e96..6b7898d 100644 (file)
--- a/composer.json
+++ b/composer.json
@@ -34,6 +34,7 @@
                 "wikimedia/cdb": "1.3.0",
                 "wikimedia/cldr-plural-rule-parser": "1.0.0",
                 "wikimedia/composer-merge-plugin": "1.3.1",
+               "wikimedia/html-formatter": "1.0.1",
                 "wikimedia/ip-set": "1.0.1",
                 "wikimedia/php-session-serializer": "1.0.3",
                 "wikimedia/relpath": "1.0.3",
diff --git a/includes/HtmlFormatter.php b/includes/HtmlFormatter.php

index 206f0f7..9bae8b5 100644 (file)
--- a/includes/HtmlFormatter.php
+++ b/includes/HtmlFormatter.php
@@ -1,7 +1,7 @@
  <?php
  /**
- * Performs transformations of HTML by wrapping around libxml2 and working
- * around its countless bugs.
+ * Stub for extensions that haven't switched to Composer-based version of this class
+ * @todo: remove in 1.28
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
@@ -19,348 +19,7 @@
   * http://www.gnu.org/copyleft/gpl.html
   *
   * @file
+ * @deprecated since 1.27, use HtmlFormatter\HtmlFormatter
   */
-class HtmlFormatter {
-       /**
-        * @var DOMDocument
-        */
-       private $doc;
-
-       private $html;
-       private $itemsToRemove = [];
-       private $elementsToFlatten = [];
-       protected $removeMedia = false;
-
-       /**
-        * Constructor
-        *
-        * @param string $html Text to process
-        */
-       public function __construct( $html ) {
-               $this->html = $html;
-       }
-
-       /**
-        * Turns a chunk of HTML into a proper document
-        * @param string $html
-        * @return string
-        */
-       public static function wrapHTML( $html ) {
-               return '<!doctype html><html><head></head><body>' . $html . '</body></html>';
-       }
-
-       /**
-        * Override this in descendant class to modify HTML after it has been converted from DOM tree
-        * @param string $html HTML to process
-        * @return string Processed HTML
-        */
-       protected function onHtmlReady( $html ) {
-               return $html;
-       }
-
-       /**
-        * @return DOMDocument DOM to manipulate
-        */
-       public function getDoc() {
-               if ( !$this->doc ) {
-                       // DOMDocument::loadHTML isn't very good with encodings, so
-                       // convert input to ASCII by encoding everything above 128 as entities.
-                       $html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' );
-
-                       // Workaround for bug that caused spaces before references
-                       // to disappear during processing: https://phabricator.wikimedia.org/T55086
-                       // TODO: Please replace with a better fix if one can be found.
-                       $html = str_replace( ' <', '&#32;<', $html );
-
-                       libxml_use_internal_errors( true );
-                       $loader = libxml_disable_entity_loader();
-                       $this->doc = new DOMDocument();
-                       $this->doc->strictErrorChecking = false;
-                       $this->doc->loadHTML( $html );
-                       libxml_disable_entity_loader( $loader );
-                       libxml_use_internal_errors( false );
-                       $this->doc->encoding = 'UTF-8';
-               }
-               return $this->doc;
-       }
-
-       /**
-        * Sets whether images/videos/sounds should be removed from output
-        * @param bool $flag
-        */
-       public function setRemoveMedia( $flag = true ) {
-               $this->removeMedia = $flag;
-       }
-
-       /**
-        * Adds one or more selector of content to remove. A subset of CSS selector
-        * syntax is supported:
-        *
-        *   <tag>
-        *   <tag>.class
-        *   .<class>
-        *   #<id>
-        *
-        * @param array|string $selectors Selector(s) of stuff to remove
-        */
-       public function remove( $selectors ) {
-               $this->itemsToRemove = array_merge( $this->itemsToRemove, (array)$selectors );
-       }
-
-       /**
-        * Adds one or more element name to the list to flatten (remove tag, but not its content)
-        * Can accept undelimited regexes
-        *
-        * Note this interface may fail in surprising unexpected ways due to usage of regexes,
-        * so should not be relied on for HTML markup security measures.
-        *
-        * @param array|string $elements Name(s) of tag(s) to flatten
-        */
-       public function flatten( $elements ) {
-               $this->elementsToFlatten = array_merge( $this->elementsToFlatten, (array)$elements );
-       }
-
-       /**
-        * Instructs the formatter to flatten all tags
-        */
-       public function flattenAllTags() {
-               $this->flatten( '[?!]?[a-z0-9]+' );
-       }
-
-       /**
-        * Removes content we've chosen to remove.  The text of the removed elements can be
-        * extracted with the getText method.
-        * @return array Array of removed DOMElements
-        */
-       public function filterContent() {
-               $removals = $this->parseItemsToRemove();
-
-               // Bail out early if nothing to do
-               if ( array_reduce( $removals,
-                       function ( $carry, $item ) {
-                               return $carry && !$item;
-                       },
-                       true
-               ) ) {
-                       return [];
-               }
-
-               $doc = $this->getDoc();
-
-               // Remove tags
-
-               // You can't remove DOMNodes from a DOMNodeList as you're iterating
-               // over them in a foreach loop. It will seemingly leave the internal
-               // iterator on the foreach out of wack and results will be quite
-               // strange. Though, making a queue of items to remove seems to work.
-               $domElemsToRemove = [];
-               foreach ( $removals['TAG'] as $tagToRemove ) {
-                       $tagToRemoveNodes = $doc->getElementsByTagName( $tagToRemove );
-                       foreach ( $tagToRemoveNodes as $tagToRemoveNode ) {
-                               if ( $tagToRemoveNode ) {
-                                       $domElemsToRemove[] = $tagToRemoveNode;
-                               }
-                       }
-               }
-               $removed = $this->removeElements( $domElemsToRemove );
-
-               // Elements with named IDs
-               $domElemsToRemove = [];
-               foreach ( $removals['ID'] as $itemToRemove ) {
-                       $itemToRemoveNode = $doc->getElementById( $itemToRemove );
-                       if ( $itemToRemoveNode ) {
-                               $domElemsToRemove[] = $itemToRemoveNode;
-                       }
-               }
-               $removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );
-
-               // CSS Classes
-               $domElemsToRemove = [];
-               $xpath = new DOMXPath( $doc );
-               foreach ( $removals['CLASS'] as $classToRemove ) {
-                       $elements = $xpath->query( '//*[contains(@class, "' . $classToRemove . '")]' );
-
-                       /** @var $element DOMElement */
-                       foreach ( $elements as $element ) {
-                               $classes = $element->getAttribute( 'class' );
-                               if ( preg_match( "/\b$classToRemove\b/", $classes ) && $element->parentNode ) {
-                                       $domElemsToRemove[] = $element;
-                               }
-                       }
-               }
-               $removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );
-
-               // Tags with CSS Classes
-               foreach ( $removals['TAG_CLASS'] as $classToRemove ) {
-                       $parts = explode( '.', $classToRemove );
-
-                       $elements = $xpath->query(
-                               '//' . $parts[0] . '[@class="' . $parts[1] . '"]'
-                       );
-                       $removed = array_merge( $removed, $this->removeElements( $elements ) );
-               }
-
-               return $removed;
-       }
-
-       /**
-        * Removes a list of elelments from DOMDocument
-        * @param array|DOMNodeList $elements
-        * @return array Array of removed elements
-        */
-       private function removeElements( $elements ) {
-               $list = $elements;
-               if ( $elements instanceof DOMNodeList ) {
-                       $list = [];
-                       foreach ( $elements as $element ) {
-                               $list[] = $element;
-                       }
-               }
-               /** @var $element DOMElement */
-               foreach ( $list as $element ) {
-                       if ( $element->parentNode ) {
-                               $element->parentNode->removeChild( $element );
-                       }
-               }
-               return $list;
-       }
-
-       /**
-        * libxml in its usual pointlessness converts many chars to entities - this function
-        * perfoms a reverse conversion
-        * @param string $html
-        * @return string
-        */
-       private function fixLibXML( $html ) {
-               static $replacements;
-               if ( !$replacements ) {
-                       // We don't include rules like '&#34;' => '&amp;quot;' because entities had already been
-                       // normalized by libxml. Using this function with input not sanitized by libxml is UNSAFE!
-                       $replacements = new ReplacementArray( [
-                               '&quot;' => '&amp;quot;',
-                               '&amp;' => '&amp;amp;',
-                               '&lt;' => '&amp;lt;',
-                               '&gt;' => '&amp;gt;',
-                       ] );
-               }
-               $html = $replacements->replace( $html );
-
-               // Just in case the conversion in getDoc() above used named
-               // entities that aren't known to html_entity_decode().
-               $html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' );
-
-               return $html;
-       }
-
-       /**
-        * Performs final transformations and returns resulting HTML.  Note that if you want to call this
-        * both without an element and with an element you should call it without an element first.  If you
-        * specify the $element in the method it'll change the underlying dom and you won't be able to get
-        * it back.
-        *
-        * @param DOMElement|string|null $element ID of element to get HTML from or
-        *   false to get it from the whole tree
-        * @return string Processed HTML
-        */
-       public function getText( $element = null ) {
-
-               if ( $this->doc ) {
-                       if ( $element !== null && !( $element instanceof DOMElement ) ) {
-                               $element = $this->doc->getElementById( $element );
-                       }
-                       if ( $element ) {
-                               $body = $this->doc->getElementsByTagName( 'body' )->item( 0 );
-                               $nodesArray = [];
-                               foreach ( $body->childNodes as $node ) {
-                                       $nodesArray[] = $node;
-                               }
-                               foreach ( $nodesArray as $nodeArray ) {
-                                       $body->removeChild( $nodeArray );
-                               }
-                               $body->appendChild( $element );
-                       }
-                       $html = $this->doc->saveHTML();
-
-                       $html = $this->fixLibXML( $html );
-                       if ( wfIsWindows() ) {
-                               // Cleanup for CRLF misprocessing of unknown origin on Windows.
-                               // If this error continues in the future, please track it down in the
-                               // XML code paths if possible and fix there.
-                               $html = str_replace( '&#13;', '', $html );
-                       }
-               } else {
-                       $html = $this->html;
-               }
-               // Remove stuff added by wrapHTML()
-               $html = preg_replace( '/<!--.*?-->|^.*?<body>|<\/body>.*$/s', '', $html );
-               $html = $this->onHtmlReady( $html );
-
-               if ( $this->elementsToFlatten ) {
-                       $elements = implode( '|', $this->elementsToFlatten );
-                       $html = preg_replace( "#</?($elements)\\b[^>]*>#is", '', $html );
-               }
-
-               return $html;
-       }
-
-       /**
-        * Helper function for parseItemsToRemove(). This function extracts the selector type
-        * and the raw name of a selector from a CSS-style selector string and assigns those
-        * values to parameters passed by reference. For example, if given '#toc' as the
-        * $selector parameter, it will assign 'ID' as the $type and 'toc' as the $rawName.
-        * @param string $selector CSS selector to parse
-        * @param string $type The type of selector (ID, CLASS, TAG_CLASS, or TAG)
-        * @param string $rawName The raw name of the selector
-        * @return bool Whether the selector was successfully recognised
-        * @throws MWException
-        */
-       protected function parseSelector( $selector, &$type, &$rawName ) {
-               if ( strpos( $selector, '.' ) === 0 ) {
-                       $type = 'CLASS';
-                       $rawName = substr( $selector, 1 );
-               } elseif ( strpos( $selector, '#' ) === 0 ) {
-                       $type = 'ID';
-                       $rawName = substr( $selector, 1 );
-               } elseif ( strpos( $selector, '.' ) !== 0 && strpos( $selector, '.' ) !== false ) {
-                       $type = 'TAG_CLASS';
-                       $rawName = $selector;
-               } elseif ( strpos( $selector, '[' ) === false && strpos( $selector, ']' ) === false ) {
-                       $type = 'TAG';
-                       $rawName = $selector;
-               } else {
-                       throw new MWException( __METHOD__ . "(): unrecognized selector '$selector'" );
-               }
-
-               return true;
-       }
-
-       /**
-        * Transforms CSS-style selectors into an internal representation suitable for
-        * processing by filterContent()
-        * @return array
-        */
-       protected function parseItemsToRemove() {
-               $removals = [
-                       'ID' => [],
-                       'TAG' => [],
-                       'CLASS' => [],
-                       'TAG_CLASS' => [],
-               ];
-
-               foreach ( $this->itemsToRemove as $itemToRemove ) {
-                       $type = '';
-                       $rawName = '';
-                       if ( $this->parseSelector( $itemToRemove, $type, $rawName ) ) {
-                               $removals[$type][] = $rawName;
-                       }
-               }
-
-               if ( $this->removeMedia ) {
-                       $removals['TAG'][] = 'img';
-                       $removals['TAG'][] = 'audio';
-                       $removals['TAG'][] = 'video';
-               }
-
-               return $removals;
-       }
+class HtmlFormatter extends HtmlFormatter\HtmlFormatter {
  }
diff --git a/includes/api/ApiHelp.php b/includes/api/ApiHelp.php

index f2d6329..f7539ce 100644 (file)
--- a/includes/api/ApiHelp.php
+++ b/includes/api/ApiHelp.php
@@ -24,6 +24,8 @@
   * @file
   */
  
+use HtmlFormatter\HtmlFormatter;
+
  /**
   * Class to output help for an API module
   *
diff --git a/tests/phpunit/includes/HtmlFormatterTest.php b/tests/phpunit/includes/HtmlFormatterTest.php

deleted file mode 100644 (file)

index ab5219c..0000000
--- a/tests/phpunit/includes/HtmlFormatterTest.php
+++ /dev/null
@@ -1,141 +0,0 @@
-<?php
-
-/**
- * @group HtmlFormatter
- */
-class HtmlFormatterTest extends MediaWikiTestCase {
-
-       /**
-        * Use TidySupport to check whether we should use $wgTidyInternal.
-        *
-        * The Tidy extension in HHVM does not support error text return, so it is
-        * nominally usable, but does not pass tests which require error text from
-        * Tidy.
-        */
-       protected function setUp() {
-               parent::setUp();
-               $tidySupport = new TidySupport();
-               $this->setMwGlobals( 'wgTidyInternal', $tidySupport->isInternal() );
-       }
-
-       /**
-        * @dataProvider getHtmlData
-        *
-        * @param string $input
-        * @param string $expectedText
-        * @param array $expectedRemoved
-        * @param callable|bool $callback
-        */
-       public function testTransform( $input, $expectedText,
-               $expectedRemoved = [], $callback = false
-       ) {
-               $input = self::normalize( $input );
-               $formatter = new HtmlFormatter( HtmlFormatter::wrapHTML( $input ) );
-               if ( $callback ) {
-                       $callback( $formatter );
-               }
-               $removedElements = $formatter->filterContent();
-               $html = $formatter->getText();
-               $removed = [];
-               foreach ( $removedElements as $removedElement ) {
-                       $removed[] = self::normalize( $formatter->getText( $removedElement ) );
-               }
-               $expectedRemoved = array_map( 'self::normalize', $expectedRemoved );
-
-               $this->assertValidHtmlSnippet( $html );
-               $this->assertEquals( self::normalize( $expectedText ), self::normalize( $html ) );
-               $this->assertEquals( asort( $expectedRemoved ), asort( $removed ) );
-       }
-
-       private static function normalize( $s ) {
-               return str_replace( "\n", '',
-                       str_replace( "\r", '', $s ) // "yay" to Windows!
-               );
-       }
-
-       public function getHtmlData() {
-               $removeImages = function ( HtmlFormatter $f ) {
-                       $f->setRemoveMedia();
-               };
-               $removeTags = function ( HtmlFormatter $f ) {
-                       $f->remove( [ 'table', '.foo', '#bar', 'div.baz' ] );
-               };
-               $flattenSomeStuff = function ( HtmlFormatter $f ) {
-                       $f->flatten( [ 's', 'div' ] );
-               };
-               $flattenEverything = function ( HtmlFormatter $f ) {
-                       $f->flattenAllTags();
-               };
-               return [
-                       // remove images if asked
-                       [
-                               '<img src="/foo/bar.jpg" alt="Blah"/>',
-                               '',
-                               [ '<img src="/foo/bar.jpg" alt="Blah">' ],
-                               $removeImages,
-                       ],
-                       // basic tag removal
-                       [
-                               // @codingStandardsIgnoreStart Ignore long line warnings.
-                               '<table><tr><td>foo</td></tr></table><div class="foo">foo</div><div class="foo quux">foo</div><span id="bar">bar</span>
-<strong class="foo" id="bar">foobar</strong><div class="notfoo">test</div><div class="baz"/>
-<span class="baz">baz</span>',
-                               // @codingStandardsIgnoreEnd
-                               '<div class="notfoo">test</div>
-<span class="baz">baz</span>',
-                               [
-                                       '<table><tr><td>foo</td></tr></table>',
-                                       '<div class="foo">foo</div>',
-                                       '<div class="foo quux">foo</div>',
-                                       '<span id="bar">bar</span>',
-                                       '<strong class="foo" id="bar">foobar</strong>',
-                                       '<div class="baz"/>',
-                               ],
-                               $removeTags,
-                       ],
-                       // don't flatten tags that start like chosen ones
-                       [
-                               '<div><s>foo</s> <span>bar</span></div>',
-                               'foo <span>bar</span>',
-                               [],
-                               $flattenSomeStuff,
-                       ],
-                       // total flattening
-                       [
-                               '<div style="foo">bar<sup>2</sup></div>',
-                               'bar2',
-                               [],
-                               $flattenEverything,
-                       ],
-                       // UTF-8 preservation and security
-                       [
-                               '<span title="&quot; \' &amp;">&lt;Тест!&gt;</span> &amp;&lt;&#38;&#0038;&#x26;&#x026;',
-                               '<span title="&quot; \' &amp;">&lt;Тест!&gt;</span> &amp;&lt;&amp;&amp;&amp;&amp;',
-                               [],
-                               $removeTags, // Have some rules to trigger a DOM parse
-                       ],
-                       // https://phabricator.wikimedia.org/T55086
-                       [
-                               'Foo<sup id="cite_ref-1" class="reference"><a href="#cite_note-1">[1]</a></sup>'
-                                       . ' <a href="/wiki/Bar" title="Bar" class="mw-redirect">Bar</a>',
-                               'Foo<sup id="cite_ref-1" class="reference"><a href="#cite_note-1">[1]</a></sup>'
-                                       . ' <a href="/wiki/Bar" title="Bar" class="mw-redirect">Bar</a>',
-                       ],
-               ];
-       }
-
-       public function testQuickProcessing() {
-               $f = new MockHtmlFormatter( 'foo' );
-               $f->filterContent();
-               $this->assertFalse( $f->hasDoc, 'HtmlFormatter should not needlessly parse HTML' );
-       }
-}
-
-class MockHtmlFormatter extends HtmlFormatter {
-       public $hasDoc = false;
-
-       public function getDoc() {
-               $this->hasDoc = true;
-               return parent::getDoc();
-       }
-}
author	Max Semenik <maxsem.wiki@gmail.com>
	Tue, 12 Apr 2016 20:20:20 +0000 (13:20 -0700)
committer	BryanDavis <bdavis@wikimedia.org>
	Wed, 13 Apr 2016 02:37:59 +0000 (02:37 +0000)
composer.json		patch \| blob \| history
includes/HtmlFormatter.php		patch \| blob \| history
includes/api/ApiHelp.php		patch \| blob \| history
tests/phpunit/includes/HtmlFormatterTest.php	[deleted file]	patch \| blob \| history