Make HtmlFormatter return removed elements
author Nik Everett <neverett@wikimedia.org>
Fri, 18 Apr 2014 13:16:04 +0000 (09:16 -0400)
committer Nik Everett <neverett@wikimedia.org>
Fri, 18 Apr 2014 13:23:03 +0000 (09:23 -0400)
This shouldn't cause much overhead and is useful for getting the text of
the removed tags.

Change-Id: I97cf66014719244b8bb2b0509b419c82202bdb01

includes/HtmlFormatter.php
tests/phpunit/includes/HtmlFormatterTest.php

index 7f590e5..96ffe9e 100644 (file)
@@ -128,7 +128,9 @@ class HtmlFormatter {
        }
 
        /**
-        * Removes content we've chosen to remove
+        * Removes content we've chosen to remove.  The text of the removed elements can be
+        * extracted with the getText method.
+        * @return array of removed DOMElements
         */
        public function filterContent() {
                wfProfileIn( __METHOD__ );
@@ -156,8 +158,7 @@ class HtmlFormatter {
                                }
                        }
                }
-
-               $this->removeElements( $domElemsToRemove );
+               $removed = $this->removeElements( $domElemsToRemove );
 
                // Elements with named IDs
                $domElemsToRemove = array();
@@ -167,7 +168,7 @@ class HtmlFormatter {
                                $domElemsToRemove[] = $itemToRemoveNode;
                        }
                }
-               $this->removeElements( $domElemsToRemove );
+               $removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );
 
                // CSS Classes
                $domElemsToRemove = array();
@@ -183,7 +184,7 @@ class HtmlFormatter {
                                }
                        }
                }
-               $this->removeElements( $domElemsToRemove );
+               $removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );
 
                // Tags with CSS Classes
                foreach ( $removals['TAG_CLASS'] as $classToRemove ) {
@@ -192,16 +193,17 @@ class HtmlFormatter {
                        $elements = $xpath->query(
                                '//' . $parts[0] . '[@class="' . $parts[1] . '"]'
                        );
-
-                       $this->removeElements( $elements );
+                       $removed = array_merge( $removed, $this->removeElements( $elements ) );
                }
 
                wfProfileOut( __METHOD__ );
+               return $removed;
        }
 
        /**
         * Removes a list of elelments from DOMDocument
         * @param array|DOMNodeList $elements
+        * @return array of removed elements
         */
        private function removeElements( $elements ) {
                $list = $elements;
@@ -217,6 +219,7 @@ class HtmlFormatter {
                                $element->parentNode->removeChild( $element );
                        }
                }
+               return $list;
        }
 
        /**
@@ -245,7 +248,10 @@ class HtmlFormatter {
        }
 
        /**
-        * Performs final transformations and returns resulting HTML
+        * Performs final transformations and returns resulting HTML.  Note that if you want to call this
+        * both without an element and with an element, you should call it without an element first.  If you
+        * pass $element to the method, it will change the underlying DOM and you won't be able to get
+        * it back.
         *
         * @param DOMElement|string|null $element ID of element to get HTML from or false to get it from the whole tree
         * @return string Processed HTML
index 99a6efd..98eff7b 100644 (file)
@@ -8,17 +8,23 @@ class HtmlFormatterTest extends MediaWikiTestCase {
         * @dataProvider getHtmlData
         * @covers HtmlFormatter::getText
         */
-       public function testTransform( $input, $expected, $callback = false ) {
+       public function testTransform( $input, $expectedText, $expectedRemoved = array(), $callback = false ) {
                $input = self::normalize( $input );
                $formatter = new HtmlFormatter( HtmlFormatter::wrapHTML( $input ) );
                if ( $callback ) {
                        $callback( $formatter );
                }
-               $formatter->filterContent();
+               $removedElements = $formatter->filterContent();
                $html = $formatter->getText();
+               $removed = array();
+               foreach ( $removedElements as $removedElement ) {
+                       $removed[] = self::normalize( $formatter->getText( $removedElement ) );
+               }
+               $expectedRemoved = array_map( 'self::normalize', $expectedRemoved );
 
                $this->assertValidHtmlSnippet( $html );
-               $this->assertEquals( self::normalize( $expected ), self::normalize( $html ) );
+               $this->assertEquals( self::normalize( $expectedText ), self::normalize( $html ) );
+               $this->assertEquals( array_count_values( $expectedRemoved ), array_count_values( $removed ) ); // order-insensitive; asort() returns bool, so comparing its results always passed
        }
 
        private static function normalize( $s ) {
@@ -45,6 +51,7 @@ class HtmlFormatterTest extends MediaWikiTestCase {
                        array(
                                '<img src="/foo/bar.jpg" alt="Blah"/>',
                                '',
+                               array( '<img src="/foo/bar.jpg" alt="Blah">' ),
                                $removeImages,
                        ),
                        // basic tag removal
@@ -52,21 +59,30 @@ class HtmlFormatterTest extends MediaWikiTestCase {
                                '<table><tr><td>foo</td></tr></table><div class="foo">foo</div><div class="foo quux">foo</div><span id="bar">bar</span>
 <strong class="foo" id="bar">foobar</strong><div class="notfoo">test</div><div class="baz"/>
 <span class="baz">baz</span>',
-
                                '<div class="notfoo">test</div>
 <span class="baz">baz</span>',
+                               array(
+                                       '<table><tr><td>foo</td></tr></table>',
+                                       '<div class="foo">foo</div>',
+                                       '<div class="foo quux">foo</div>',
+                                       '<span id="bar">bar</span>',
+                                       '<strong class="foo" id="bar">foobar</strong>',
+                                       '<div class="baz"/>',
+                               ),
                                $removeTags,
                        ),
                        // don't flatten tags that start like chosen ones
                        array(
                                '<div><s>foo</s> <span>bar</span></div>',
                                'foo <span>bar</span>',
+                               array(),
                                $flattenSomeStuff,
                        ),
                        // total flattening
                        array(
                                '<div style="foo">bar<sup>2</sup></div>',
                                'bar2',
+                               array(),
                                $flattenEverything,
                        ),
                        // UTF-8 preservation and security