3 * Performs transformations of HTML by wrapping around libxml2 and working
4 * around its countless bugs.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 * http://www.gnu.org/copyleft/gpl.html
30 private $itemsToRemove = array();
31 private $elementsToFlatten = array();
32 protected $removeMedia = false;
37 * @param string $html: Text to process
39 public function __construct( $html ) {
44 * Turns a chunk of HTML into a proper document
public static function wrapHTML( $html ) {
    // Minimal document skeleton: the supplied fragment becomes the sole
    // content of <body>, preceded by a doctype and an empty <head>.
    $opening = '<!doctype html><html><head></head><body>';
    $closing = '</body></html>';

    return $opening . $html . $closing;
}
53 * Override this in a descendant class to modify the HTML after it has been converted from the DOM tree
54 * @param string $html: HTML to process
55 * @return string: Processed HTML
57 protected function onHtmlReady( $html ) {
62 * @return DOMDocument: DOM to manipulate
64 public function getDoc() {
66 $html = mb_convert_encoding( $this->html
, 'HTML-ENTITIES', 'UTF-8' );
68 // Workaround for bug that caused spaces before references
69 // to disappear during processing:
70 // https://bugzilla.wikimedia.org/show_bug.cgi?id=53086
72 // Please replace with a better fix if one can be found.
73 $html = str_replace( ' <', ' <', $html );
75 libxml_use_internal_errors( true );
76 $this->doc
= new DOMDocument();
77 $this->doc
->strictErrorChecking
= false;
78 $this->doc
->loadHTML( $html );
79 libxml_use_internal_errors( false );
80 $this->doc
->encoding
= 'UTF-8';
86 * Sets whether images/videos/sounds should be removed from output
public function setRemoveMedia( $flag = true ) {
    // The flag is only stored here. When it is truthy,
    // parseItemsToRemove() adds img, audio and video to the tags to remove.
    $this->removeMedia = $flag;
}
94 * Adds one or more selectors of content to remove. A subset of CSS selector
95 * syntax is supported:
102 * @param Array|string $selectors: Selector(s) of stuff to remove
public function remove( $selectors ) {
    // A lone selector string is cast to a one-element array so that both
    // accepted argument forms go through the same merge.
    $additions = (array)$selectors;

    $this->itemsToRemove = array_merge( $this->itemsToRemove, $additions );
}
109 * Adds one or more element names to the list to flatten (remove tag, but not its content)
110 * Can accept undelimited regexes
112 * Note this interface may fail in surprising unexpected ways due to usage of regexes,
113 * so should not be relied on for HTML markup security measures.
115 * @param Array|string $elements: Name(s) of tag(s) to flatten
public function flatten( $elements ) {
    // Accept either a single name/regex or an array of them; the cast
    // turns a lone string into a one-element array before merging.
    $additions = (array)$elements;

    $this->elementsToFlatten = array_merge( $this->elementsToFlatten, $additions );
}
122 * Instructs the formatter to flatten all tags
public function flattenAllTags() {
    // Undelimited regex handed to flatten(): an optional leading "?" or "!"
    // followed by one or more characters from a-z / 0-9.
    $anyTagName = '[?!]?[a-z0-9]+';

    $this->flatten( $anyTagName );
}
129 * Removes content we've chosen to remove
131 public function filterContent() {
132 wfProfileIn( __METHOD__
);
133 $removals = $this->parseItemsToRemove();
136 wfProfileOut( __METHOD__
);
140 $doc = $this->getDoc();
144 // You can't remove DOMNodes from a DOMNodeList as you're iterating
145 // over them in a foreach loop. It will seemingly leave the internal
146 // iterator on the foreach out of whack and results will be quite
147 // strange. Though, making a queue of items to remove seems to work.
148 $domElemsToRemove = array();
149 foreach ( $removals['TAG'] as $tagToRemove ) {
150 $tagToRemoveNodes = $doc->getElementsByTagName( $tagToRemove );
151 foreach ( $tagToRemoveNodes as $tagToRemoveNode ) {
152 if ( $tagToRemoveNode ) {
153 $domElemsToRemove[] = $tagToRemoveNode;
158 $this->removeElements( $domElemsToRemove );
160 // Elements with named IDs
161 $domElemsToRemove = array();
162 foreach ( $removals['ID'] as $itemToRemove ) {
163 $itemToRemoveNode = $doc->getElementById( $itemToRemove );
164 if ( $itemToRemoveNode ) {
165 $domElemsToRemove[] = $itemToRemoveNode;
168 $this->removeElements( $domElemsToRemove );
171 $domElemsToRemove = array();
172 $xpath = new DOMXpath( $doc );
173 foreach ( $removals['CLASS'] as $classToRemove ) {
174 $elements = $xpath->query( '//*[contains(@class, "' . $classToRemove . '")]' );
176 /** @var $element DOMElement */
177 foreach ( $elements as $element ) {
178 $classes = $element->getAttribute( 'class' );
179 if ( preg_match( "/\b$classToRemove\b/", $classes ) && $element->parentNode
) {
180 $domElemsToRemove[] = $element;
184 $this->removeElements( $domElemsToRemove );
186 // Tags with CSS Classes
187 foreach ( $removals['TAG_CLASS'] as $classToRemove ) {
188 $parts = explode( '.', $classToRemove );
190 $elements = $xpath->query(
191 '//' . $parts[0] . '[@class="' . $parts[1] . '"]'
194 $this->removeElements( $elements );
197 wfProfileOut( __METHOD__
);
201 * Removes a list of elements from DOMDocument
202 * @param array|DOMNodeList $elements
204 private function removeElements( $elements ) {
206 if ( $elements instanceof DOMNodeList
) {
208 foreach ( $elements as $element ) {
212 /** @var $element DOMElement */
213 foreach ( $list as $element ) {
214 if ( $element->parentNode
) {
215 $element->parentNode
->removeChild( $element );
221 * libxml in its usual pointlessness converts many chars to entities - this function
222 * performs a reverse conversion
223 * @param string $html
226 private function fixLibXML( $html ) {
227 wfProfileIn( __METHOD__
);
228 static $replacements;
229 if ( ! $replacements ) {
230 // We don't include rules like '"' => '&quot;' because entities had already been
231 // normalized by libxml. Using this function with input not sanitized by libxml is UNSAFE!
232 $replacements = new ReplacementArray( array(
233 '"' => '&quot;',
234 '&' => '&amp;',
235 '<' => '&lt;',
236 '>' => '&gt;',
239 $html = $replacements->replace( $html );
240 $html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' );
241 wfProfileOut( __METHOD__
);
246 * Performs final transformations and returns resulting HTML
248 * @param DOMElement|string|null $element: Element, or ID of the element, to get HTML from; null to get it from the whole tree
249 * @return string: Processed HTML
251 public function getText( $element = null ) {
252 wfProfileIn( __METHOD__
);
255 if ( $element !== null && !( $element instanceof DOMElement
) ) {
256 $element = $this->doc
->getElementById( $element );
259 $body = $this->doc
->getElementsByTagName( 'body' )->item( 0 );
260 $nodesArray = array();
261 foreach ( $body->childNodes
as $node ) {
262 $nodesArray[] = $node;
264 foreach ( $nodesArray as $nodeArray ) {
265 $body->removeChild( $nodeArray );
267 $body->appendChild( $element );
269 $html = $this->doc
->saveHTML();
270 $html = $this->fixLibXml( $html );
274 if ( wfIsWindows() ) {
275 // Appears to be cleanup for CRLF misprocessing of unknown origin
276 // when running server on Windows platform.
278 // If this error continues in the future, please track it down in the
279 // XML code paths if possible and fix there.
280 $html = str_replace( ' ', '', $html );
282 $html = preg_replace( '/<!--.*?-->|^.*?<body>|<\/body>.*$/s', '', $html );
283 $html = $this->onHtmlReady( $html );
285 if ( $this->elementsToFlatten
) {
286 $elements = implode( '|', $this->elementsToFlatten
);
287 $html = preg_replace( "#</?($elements)\\b[^>]*>#is", '', $html );
290 wfProfileOut( __METHOD__
);
295 * @param $selector: CSS selector to parse
298 * @return bool: Whether the selector was successfully recognised
300 protected function parseSelector( $selector, &$type, &$rawName ) {
301 if ( strpos( $selector, '.' ) === 0 ) {
303 $rawName = substr( $selector, 1 );
304 } elseif ( strpos( $selector, '#' ) === 0 ) {
306 $rawName = substr( $selector, 1 );
307 } elseif ( strpos( $selector, '.' ) !== 0 &&
308 strpos( $selector, '.' ) !== false )
311 $rawName = $selector;
312 } elseif ( strpos( $selector, '[' ) === false
313 && strpos( $selector, ']' ) === false )
316 $rawName = $selector;
318 throw new MWException( __METHOD__
. "(): unrecognized selector '$selector'" );
325 * Transforms CSS selectors into an internal representation suitable for processing
328 protected function parseItemsToRemove() {
329 wfProfileIn( __METHOD__
);
334 'TAG_CLASS' => array(),
337 foreach ( $this->itemsToRemove
as $itemToRemove ) {
340 if ( $this->parseSelector( $itemToRemove, $type, $rawName ) ) {
341 $removals[$type][] = $rawName;
345 if ( $this->removeMedia
) {
346 $removals['TAG'][] = 'img';
347 $removals['TAG'][] = 'audio';
348 $removals['TAG'][] = 'video';
351 wfProfileOut( __METHOD__
);