private $doc;
private $html;
- private $itemsToRemove = array();
- private $elementsToFlatten = array();
+ private $itemsToRemove = [];
+ private $elementsToFlatten = [];
protected $removeMedia = false;
/**
*/
public function getDoc() {
if ( !$this->doc ) {
- // DOMDocument::loadHTML apparently isn't very good with encodings, so
+ // DOMDocument::loadHTML isn't very good with encodings, so
// convert input to ASCII by encoding everything above 128 as entities.
- if ( function_exists( 'mb_convert_encoding' ) ) {
- $html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' );
- } else {
- $html = preg_replace_callback( '/[\x{80}-\x{10ffff}]/u', function ( $m ) {
- return '&#' . UtfNormal\Utils::utf8ToCodepoint( $m[0] ) . ';';
- }, $this->html );
- }
+ $html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' );
// Workaround for bug that caused spaces before references
// to disappear during processing: https://phabricator.wikimedia.org/T55086
},
true
) ) {
- return array();
+ return [];
}
$doc = $this->getDoc();
// over them in a foreach loop. It will seemingly leave the internal
// iterator on the foreach out of whack and results will be quite
// strange. Though, making a queue of items to remove seems to work.
- $domElemsToRemove = array();
+ $domElemsToRemove = [];
foreach ( $removals['TAG'] as $tagToRemove ) {
$tagToRemoveNodes = $doc->getElementsByTagName( $tagToRemove );
foreach ( $tagToRemoveNodes as $tagToRemoveNode ) {
$removed = $this->removeElements( $domElemsToRemove );
// Elements with named IDs
- $domElemsToRemove = array();
+ $domElemsToRemove = [];
foreach ( $removals['ID'] as $itemToRemove ) {
$itemToRemoveNode = $doc->getElementById( $itemToRemove );
if ( $itemToRemoveNode ) {
$removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) );
// CSS Classes
- $domElemsToRemove = array();
+ $domElemsToRemove = [];
$xpath = new DOMXPath( $doc );
foreach ( $removals['CLASS'] as $classToRemove ) {
$elements = $xpath->query( '//*[contains(@class, "' . $classToRemove . '")]' );
private function removeElements( $elements ) {
$list = $elements;
if ( $elements instanceof DOMNodeList ) {
- $list = array();
+ $list = [];
foreach ( $elements as $element ) {
$list[] = $element;
}
if ( !$replacements ) {
// We don't include rules like '&#34;' => '&amp;quot;' because entities had already been
// normalized by libxml. Using this function with input not sanitized by libxml is UNSAFE!
- $replacements = new ReplacementArray( array(
+ $replacements = new ReplacementArray( [
'"' => '"',
'&' => '&',
'<' => '&lt;',
'>' => '&gt;',
- ) );
+ ] );
}
$html = $replacements->replace( $html );
- if ( function_exists( 'mb_convert_encoding' ) ) {
- // Just in case the conversion in getDoc() above used named
- // entities that aren't known to html_entity_decode().
- $html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' );
- } else {
- $html = html_entity_decode( $html, ENT_COMPAT, 'utf-8' );
- }
+ // Just in case the conversion in getDoc() above used named
+ // entities that aren't known to html_entity_decode().
+ $html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' );
+
return $html;
}
}
if ( $element ) {
$body = $this->doc->getElementsByTagName( 'body' )->item( 0 );
- $nodesArray = array();
+ $nodesArray = [];
foreach ( $body->childNodes as $node ) {
$nodesArray[] = $node;
}
}
$html = $this->doc->saveHTML();
- $html = $this->fixLibXml( $html );
+ $html = $this->fixLibXML( $html );
if ( wfIsWindows() ) {
// Cleanup for CRLF misprocessing of unknown origin on Windows.
// If this error continues in the future, please track it down in the
* @return array
*/
protected function parseItemsToRemove() {
- $removals = array(
- 'ID' => array(),
- 'TAG' => array(),
- 'CLASS' => array(),
- 'TAG_CLASS' => array(),
- );
+ $removals = [
+ 'ID' => [],
+ 'TAG' => [],
+ 'CLASS' => [],
+ 'TAG_CLASS' => [],
+ ];
foreach ( $this->itemsToRemove as $itemToRemove ) {
$type = '';