* Fields in ParserOptions are now private. Use the accessors instead.
* Custom LESS functions (defined via $wgResourceLoaderLESSFunctions)
have been removed, after being deprecated in 1.24.
+* $wgAlwaysUseTidy has been removed.
=== New features in 1.26 ===
* (T51506) Now action=info gives estimates of actual watchers for a page.
'MWOldPassword' => __DIR__ . '/includes/password/MWOldPassword.php',
'MWSaltedPassword' => __DIR__ . '/includes/password/MWSaltedPassword.php',
'MWTidy' => __DIR__ . '/includes/parser/MWTidy.php',
- 'MWTidyWrapper' => __DIR__ . '/includes/parser/MWTidy.php',
'MWTimestamp' => __DIR__ . '/includes/MWTimestamp.php',
'MachineReadableRCFeedFormatter' => __DIR__ . '/includes/rcfeed/MachineReadableRCFeedFormatter.php',
'MagicWord' => __DIR__ . '/includes/MagicWord.php',
'MediaWiki\\Logger\\Monolog\\WikiProcessor' => __DIR__ . '/includes/debug/logger/monolog/WikiProcessor.php',
'MediaWiki\\Logger\\NullSpi' => __DIR__ . '/includes/debug/logger/NullSpi.php',
'MediaWiki\\Logger\\Spi' => __DIR__ . '/includes/debug/logger/Spi.php',
+ 'MediaWiki\\Tidy\\RaggettBase' => __DIR__ . '/includes/tidy/RaggettBase.php',
+ 'MediaWiki\\Tidy\\RaggettExternal' => __DIR__ . '/includes/tidy/RaggettExternal.php',
+ 'MediaWiki\\Tidy\\RaggettInternalHHVM' => __DIR__ . '/includes/tidy/RaggettInternalHHVM.php',
+ 'MediaWiki\\Tidy\\RaggettInternalPHP' => __DIR__ . '/includes/tidy/RaggettInternalPHP.php',
+ 'MediaWiki\\Tidy\\RaggettWrapper' => __DIR__ . '/includes/tidy/RaggettWrapper.php',
+ 'MediaWiki\\Tidy\\TidyDriverBase' => __DIR__ . '/includes/tidy/TidyDriverBase.php',
'MediaWiki\\Widget\\ComplexNamespaceInputWidget' => __DIR__ . '/includes/widget/ComplexNamespaceInputWidget.php',
'MediaWiki\\Widget\\NamespaceInputWidget' => __DIR__ . '/includes/widget/NamespaceInputWidget.php',
'MediaWiki\\Widget\\TitleInputWidget' => __DIR__ . '/includes/widget/TitleInputWidget.php',
$wgAllowImageTag = false;
/**
- * $wgUseTidy: use tidy to make sure HTML output is sane.
- * Tidy is a free tool that fixes broken HTML.
- * See http://www.w3.org/People/Raggett/tidy/
- *
- * - $wgTidyBin should be set to the path of the binary and
- * - $wgTidyConf to the path of the configuration file.
- * - $wgTidyOpts can include any number of parameters.
- * - $wgTidyInternal controls the use of the PECL extension or the
- * libtidy (PHP >= 5) extension to use an in-process tidy library instead
- * of spawning a separate program.
- * Normally you shouldn't need to override the setting except for
- * debugging. To install, use 'pear install tidy' and add a line
- * 'extension=tidy.so' to php.ini.
+ * Configuration for HTML postprocessing tool. Set this to a configuration
+ * array to enable an external tool. Dave Raggett's "HTML Tidy" is typically
+ * used. See http://www.w3.org/People/Raggett/tidy/
+ *
+ * If this is null and $wgUseTidy is true, the deprecated configuration
+ * parameters will be used instead.
+ *
+ * If this is null and $wgUseTidy is false, a pure PHP fallback will be used.
+ *
+ * Keys are:
+ * - driver: May be:
+ * - RaggettInternalHHVM: Use the limited-functionality HHVM extension
+ * - RaggettInternalPHP: Use the PECL extension
+ * - RaggettExternal: Shell out to an external binary (tidyBin)
+ *
+ * - tidyConfigFile: Path to configuration file for any of the Raggett drivers
+ * - debugComment: True to add a comment to the output with warning messages
+ * - tidyBin: For RaggettExternal, the path to the tidy binary.
+ * - tidyCommandLine: For RaggettExternal, additional command line options.
*/
-$wgUseTidy = false;
+$wgTidyConfig = null;
/**
- * @see $wgUseTidy
+ * Set this to true to use the deprecated tidy configuration parameters.
+ * @deprecated use $wgTidyConfig
*/
-$wgAlwaysUseTidy = false;
+$wgUseTidy = false;
/**
- * @see $wgUseTidy
+ * The path to the tidy binary.
+ * @deprecated Use $wgTidyConfig['tidyBin']
*/
$wgTidyBin = 'tidy';
/**
- * @see $wgUseTidy
+ * The path to the tidy config file
+ * @deprecated Use $wgTidyConfig['tidyConfigFile']
*/
-$wgTidyConf = $IP . '/includes/tidy.conf';
+$wgTidyConf = $IP . '/includes/tidy/tidy.conf';
/**
- * @see $wgUseTidy
+ * The command line options to the tidy binary
+ * @deprecated Use $wgTidyConfig['tidyCommandLine']
*/
$wgTidyOpts = '';
/**
- * @see $wgUseTidy
+ * Set this to true to use the tidy extension
+ * @deprecated Use $wgTidyConfig['driver']
*/
$wgTidyInternal = extension_loaded( 'tidy' );
public static function removeHTMLtags( $text, $processCallback = null,
$args = array(), $extratags = array(), $removetags = array()
) {
- global $wgUseTidy;
-
extract( self::getRecognizedTagData( $extratags, $removetags ) );
# Remove HTML comments
$text = Sanitizer::removeHTMLcomments( $text );
$bits = explode( '<', $text );
$text = str_replace( '>', '>', array_shift( $bits ) );
- if ( !$wgUseTidy ) {
+ if ( !MWTidy::isEnabled() ) {
$tagstack = $tablestack = array();
foreach ( $bits as $x ) {
$regs = array();
* @ingroup Parser
*/
-/**
- * Class used to hide mw:editsection tokens from Tidy so that it doesn't break them
- * or break on them. This is a bit of a hack for now, but hopefully in the future
- * we may create a real postprocessor or something that will replace this.
- * It's called wrapper because for now it basically takes over MWTidy::tidy's task
- * of wrapping the text in a xhtml block
- *
- * This re-uses some of the parser's UNIQ tricks, though some of it is private so it's
- * duplicated. Perhaps we should create an abstract marker hiding class.
- *
- * @ingroup Parser
- */
-class MWTidyWrapper {
-
- /**
- * @var ReplacementArray
- */
- protected $mTokens;
-
- protected $mMarkerIndex;
-
- public function __construct() {
- $this->mTokens = null;
- }
-
- /**
- * @param string $text
- * @return string
- */
- public function getWrapped( $text ) {
- $this->mTokens = new ReplacementArray;
- $this->mMarkerIndex = 0;
-
- // Replace <mw:editsection> elements with placeholders
- $wrappedtext = preg_replace_callback( ParserOutput::EDITSECTION_REGEX,
- array( &$this, 'replaceCallback' ), $text );
- // ...and <mw:toc> markers
- $wrappedtext = preg_replace_callback( '/\<\\/?mw:toc\>/',
- array( &$this, 'replaceCallback' ), $wrappedtext );
- // ... and <math> tags
- $wrappedtext = preg_replace_callback( '/\<math(.*?)\<\\/math\>/s',
- array( &$this, 'replaceCallback' ), $wrappedtext );
- // Modify inline Microdata <link> and <meta> elements so they say <html-link> and <html-meta> so
- // we can trick Tidy into not stripping them out by including them in tidy's new-empty-tags config
- $wrappedtext = preg_replace( '!<(link|meta)([^>]*?)(/{0,1}>)!', '<html-$1$2$3', $wrappedtext );
-
- // Wrap the whole thing in a doctype and body for Tidy.
- $wrappedtext = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"' .
- ' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html>' .
- '<head><title>test</title></head><body>' . $wrappedtext . '</body></html>';
-
- return $wrappedtext;
- }
-
- /**
- * @param array $m
- *
- * @return string
- */
- public function replaceCallback( $m ) {
- $marker = Parser::MARKER_PREFIX . "-item-{$this->mMarkerIndex}" . Parser::MARKER_SUFFIX;
- $this->mMarkerIndex++;
- $this->mTokens->setPair( $marker, $m[0] );
- return $marker;
- }
-
- /**
- * @param string $text
- * @return string
- */
- public function postprocess( $text ) {
- // Revert <html-{link,meta}> back to <{link,meta}>
- $text = preg_replace( '!<html-(link|meta)([^>]*?)(/{0,1}>)!', '<$1$2$3', $text );
-
- // Restore the contents of placeholder tokens
- $text = $this->mTokens->replace( $text );
-
- return $text;
- }
-
-}
-
/**
* Class to interact with HTML tidy
*
* @ingroup Parser
*/
class MWTidy {
+ private static $instance;
+
/**
- * Interface with html tidy, used if $wgUseTidy = true.
+ * Interface with html tidy.
* If tidy isn't able to correct the markup, the original will be
* returned in all its glory with a warning comment appended.
*
* @return string Corrected HTML output
*/
public static function tidy( $text ) {
- $wrapper = new MWTidyWrapper;
- $wrappedtext = $wrapper->getWrapped( $text );
-
- $retVal = null;
- $correctedtext = self::clean( $wrappedtext, false, $retVal );
-
- if ( $retVal < 0 ) {
- wfDebug( "Possible tidy configuration error!\n" );
- return $text . "\n<!-- Tidy was unable to run -->\n";
- } elseif ( is_null( $correctedtext ) ) {
- wfDebug( "Tidy error detected!\n" );
- return $text . "\n<!-- Tidy found serious XHTML errors -->\n";
+ $driver = self::singleton();
+ if ( !$driver ) {
+ throw new MWException( __METHOD__.
+ ': tidy is disabled, caller should have checked MWTidy::isEnabled()' );
}
-
- $correctedtext = $wrapper->postprocess( $correctedtext ); // restore any hidden tokens
-
- return $correctedtext;
+ return $driver->tidy( $text );
}
/**
* @return bool Whether the HTML is valid
*/
public static function checkErrors( $text, &$errorStr = null ) {
- $retval = 0;
- $errorStr = self::clean( $text, true, $retval );
- return ( $retval < 0 && $errorStr == '' ) || $retval == 0;
+ $driver = self::singleton();
+ if ( !$driver ) {
+ throw new MWException( __METHOD__.
+ ': tidy is disabled, caller should have checked MWTidy::isEnabled()' );
+ }
+ if ( $driver->supportsValidate() ) {
+ return $driver->validate( $text, $errorStr );
+ } else {
+ throw new MWException( __METHOD__ . ": error text return from HHVM tidy is not supported" );
+ }
}
- /**
- * Perform a clean/repair operation
- * @param string $text HTML to check
- * @param bool $stderr Whether to read result from STDERR rather than STDOUT
- * @param int &$retval Exit code (-1 on internal error)
- * @return null|string
- * @throws MWException
- */
- private static function clean( $text, $stderr = false, &$retval = null ) {
- global $wgTidyInternal;
+ public static function isEnabled() {
+ return self::singleton() !== false;
+ }
- if ( $wgTidyInternal ) {
- if ( wfIsHHVM() ) {
- if ( $stderr ) {
- throw new MWException( __METHOD__ . ": error text return from HHVM tidy is not supported" );
+ protected static function singleton() {
+ global $wgUseTidy, $wgTidyInternal, $wgTidyConf, $wgDebugTidy, $wgTidyConfig,
+ $wgTidyBin, $wgTidyOpts;
+
+ if ( self::$instance === null ) {
+ if ( $wgTidyConfig !== null ) {
+ $config = $wgTidyConfig;
+ } elseif ( $wgUseTidy ) {
+ // b/c configuration
+ $config = array(
+ 'tidyConfigFile' => $wgTidyConf,
+ 'debugComment' => $wgDebugTidy,
+ 'tidyBin' => $wgTidyBin,
+ 'tidyCommandLine' => $wgTidyOpts );
+ if ( $wgTidyInternal ) {
+ if ( wfIsHHVM() ) {
+ $config['driver'] = 'RaggettInternalHHVM';
+ } else {
+ $config['driver'] = 'RaggettInternalPHP';
+ }
+ } else {
+ $config['driver'] = 'RaggettExternal';
}
- return self::hhvmClean( $text, $retval );
} else {
- return self::phpClean( $text, $stderr, $retval );
+ return false;
}
- } else {
- return self::externalClean( $text, $stderr, $retval );
- }
- }
-
- /**
- * Spawn an external HTML tidy process and get corrected markup back from it.
- * Also called in OutputHandler.php for full page validation
- *
- * @param string $text HTML to check
- * @param bool $stderr Whether to read result from STDERR rather than STDOUT
- * @param int &$retval Exit code (-1 on internal error)
- * @return string|null
- */
- private static function externalClean( $text, $stderr = false, &$retval = null ) {
- global $wgTidyConf, $wgTidyBin, $wgTidyOpts;
-
- $cleansource = '';
- $opts = ' -utf8';
-
- if ( $stderr ) {
- $descriptorspec = array(
- 0 => array( 'pipe', 'r' ),
- 1 => array( 'file', wfGetNull(), 'a' ),
- 2 => array( 'pipe', 'w' )
- );
- } else {
- $descriptorspec = array(
- 0 => array( 'pipe', 'r' ),
- 1 => array( 'pipe', 'w' ),
- 2 => array( 'file', wfGetNull(), 'a' )
- );
- }
-
- $readpipe = $stderr ? 2 : 1;
- $pipes = array();
-
- $process = proc_open(
- "$wgTidyBin -config $wgTidyConf $wgTidyOpts$opts", $descriptorspec, $pipes );
-
- //NOTE: At least on linux, the process will be created even if tidy is not installed.
- // This means that missing tidy will be treated as a validation failure.
-
- if ( is_resource( $process ) ) {
- // Theoretically, this style of communication could cause a deadlock
- // here. If the stdout buffer fills up, then writes to stdin could
- // block. This doesn't appear to happen with tidy, because tidy only
- // writes to stdout after it's finished reading from stdin. Search
- // for tidyParseStdin and tidySaveStdout in console/tidy.c
- fwrite( $pipes[0], $text );
- fclose( $pipes[0] );
- while ( !feof( $pipes[$readpipe] ) ) {
- $cleansource .= fgets( $pipes[$readpipe], 1024 );
+ switch ( $config['driver'] ) {
+ case 'RaggettInternalHHVM':
+ self::$instance = new MediaWiki\Tidy\RaggettInternalHHVM( $config );
+ break;
+ case 'RaggettInternalPHP':
+ self::$instance = new MediaWiki\Tidy\RaggettInternalPHP( $config );
+ break;
+ case 'RaggettExternal':
+ self::$instance = new MediaWiki\Tidy\RaggettExternal( $config );
+ break;
+ default:
+ throw new MWException( "Invalid tidy driver: \"{$config['driver']}\"" );
}
- fclose( $pipes[$readpipe] );
- $retval = proc_close( $process );
- } else {
- wfWarn( "Unable to start external tidy process" );
- $retval = -1;
}
-
- if ( !$stderr && $cleansource == '' && $text != '' ) {
- // Some kind of error happened, so we couldn't get the corrected text.
- // Just give up; we'll use the source text and append a warning.
- $cleansource = null;
- }
-
- return $cleansource;
+ return self::$instance;
}
/**
- * Use the HTML tidy extension to use the tidy library in-process,
- * saving the overhead of spawning a new process.
- *
- * @param string $text HTML to check
- * @param bool $stderr Whether to read result from error status instead of output
- * @param int &$retval Exit code (-1 on internal error)
- * @return string|null
+ * Set the driver to be used. This is for testing.
+ * @param TidyDriverBase|false|null $instance
*/
- private static function phpClean( $text, $stderr = false, &$retval = null ) {
- global $wgTidyConf, $wgDebugTidy;
-
- if ( ( !wfIsHHVM() && !class_exists( 'tidy' ) ) ||
- ( wfIsHHVM() && !function_exists( 'tidy_repair_string' ) )
- ) {
- wfWarn( "Unable to load internal tidy class." );
- $retval = -1;
-
- return null;
- }
-
- $tidy = new tidy;
- $tidy->parseString( $text, $wgTidyConf, 'utf8' );
-
- if ( $stderr ) {
- $retval = $tidy->getStatus();
- return $tidy->errorBuffer;
- }
-
- $tidy->cleanRepair();
- $retval = $tidy->getStatus();
- if ( $retval == 2 ) {
- // 2 is magic number for fatal error
- // http://www.php.net/manual/en/function.tidy-get-status.php
- $cleansource = null;
- } else {
- $cleansource = tidy_get_output( $tidy );
- if ( $wgDebugTidy && $retval > 0 ) {
- $cleansource .= "<!--\nTidy reports:\n" .
- str_replace( '-->', '-->', $tidy->errorBuffer ) .
- "\n-->";
- }
- }
-
- return $cleansource;
+ public static function setInstance( $instance ) {
+ self::$instance = $instance;
}
/**
- * Use the tidy extension for HHVM from
- * https://github.com/wikimedia/mediawiki-php-tidy
- *
- * This currently does not support the object-oriented interface, but
- * tidy_repair_string() can be used for the most common tasks.
- *
- * @param string $text HTML to check
- * @param int &$retval Exit code (-1 on internal error)
- * @return string|null
+ * Destroy the current singleton instance
*/
- private static function hhvmClean( $text, &$retval ) {
- global $wgTidyConf;
-
- $cleansource = tidy_repair_string( $text, $wgTidyConf, 'utf8' );
- if ( $cleansource === false ) {
- $cleansource = null;
- $retval = -1;
- } else {
- $retval = 0;
- }
-
- return $cleansource;
+ public static function destroySingleton() {
+ self::$instance = null;
}
}
* @return string
*/
private function internalParseHalfParsed( $text, $isMain = true, $linestart = true ) {
- global $wgUseTidy, $wgAlwaysUseTidy;
-
$text = $this->mStripState->unstripGeneral( $text );
if ( $isMain ) {
$text = Sanitizer::normalizeCharReferences( $text );
- if ( ( $wgUseTidy && $this->mOptions->getTidy() ) || $wgAlwaysUseTidy ) {
+ if ( MWTidy::isEnabled() && $this->mOptions->getTidy() ) {
$text = MWTidy::tidy( $text );
} else {
# attempt to sanitize at least some nesting problems
+++ /dev/null
-# html tidy (http://tidy.sf.net) configuration
-# tidy - validate, correct, and pretty-print HTML files
-# see: man 1 tidy, http://tidy.sourceforge.net/docs/quickref.html
-
-show-body-only: yes
-force-output: yes
-tidy-mark: no
-wrap: 0
-wrap-attributes: no
-literal-attributes: yes
-output-xhtml: yes
-numeric-entities: yes
-enclose-text: yes
-enclose-block-text: yes
-quiet: yes
-quote-nbsp: yes
-fix-backslash: no
-fix-uri: no
-# Don't strip html5 elements we support
-# html-{meta,link} is a hack we use to prevent Tidy from stripping <meta> and <link> used in the body for Microdata
-new-empty-tags: html-meta, html-link, wbr
-new-inline-tags: video, audio, source, track, bdi, data, time, mark
--- /dev/null
+<?php
+
+namespace MediaWiki\Tidy;
+
+abstract class RaggettBase extends TidyDriverBase {
+ /**
+ * Generic interface for wrapping and unwrapping HTML for Dave Raggett's tidy.
+ *
+ * @param string $text Hideous HTML input
+ * @return string Corrected HTML output
+ */
+ public function tidy( $text ) {
+ $wrapper = new RaggettWrapper;
+ $wrappedtext = $wrapper->getWrapped( $text );
+
+ $retVal = null;
+ $correctedtext = $this->cleanWrapped( $wrappedtext, false, $retVal );
+
+ if ( $retVal < 0 ) {
+ wfDebug( "Possible tidy configuration error!\n" );
+ return $text . "\n<!-- Tidy was unable to run -->\n";
+ } elseif ( is_null( $correctedtext ) ) {
+ wfDebug( "Tidy error detected!\n" );
+ return $text . "\n<!-- Tidy found serious XHTML errors -->\n";
+ }
+
+ $correctedtext = $wrapper->postprocess( $correctedtext ); // restore any hidden tokens
+
+ return $correctedtext;
+ }
+
+ public function validate( $text, &$errorStr ) {
+ $retval = 0;
+ $errorStr = $this->cleanWrapped( $text, true, $retval );
+ return ( $retval < 0 && $errorStr == '' ) || $retval == 0;
+ }
+
+ /**
+ * Perform a clean/repair operation
+ * @param string $text HTML to check
+ * @param bool $stderr Whether to read result from STDERR rather than STDOUT
+ * @param int &$retval Exit code (-1 on internal error)
+ * @return null|string
+ * @throws MWException
+ */
+ abstract protected function cleanWrapped( $text, $stderr = false, &$retval = null );
+}
--- /dev/null
+<?php
+
+namespace MediaWiki\Tidy;
+
+class RaggettExternal extends RaggettBase {
+ /**
+ * Spawn an external HTML tidy process and get corrected markup back from it.
+ * Also called in OutputHandler.php for full page validation
+ *
+ * @param string $text HTML to check
+ * @param bool $stderr Whether to read result from STDERR rather than STDOUT
+ * @param int &$retval Exit code (-1 on internal error)
+ * @return string|null
+ */
+ protected function cleanWrapped( $text, $stderr = false, &$retval = null ) {
+ $cleansource = '';
+ $opts = ' -utf8';
+
+ if ( $stderr ) {
+ $descriptorspec = array(
+ 0 => array( 'pipe', 'r' ),
+ 1 => array( 'file', wfGetNull(), 'a' ),
+ 2 => array( 'pipe', 'w' )
+ );
+ } else {
+ $descriptorspec = array(
+ 0 => array( 'pipe', 'r' ),
+ 1 => array( 'pipe', 'w' ),
+ 2 => array( 'file', wfGetNull(), 'a' )
+ );
+ }
+
+ $readpipe = $stderr ? 2 : 1;
+ $pipes = array();
+
+ $process = proc_open(
+ "{$this->config['tidyBin']} -config {$this->config['tidyConfigFile']} " .
+ $this->config['tidyCommandLine'] . $opts, $descriptorspec, $pipes );
+
+ //NOTE: At least on linux, the process will be created even if tidy is not installed.
+ // This means that missing tidy will be treated as a validation failure.
+
+ if ( is_resource( $process ) ) {
+ // Theoretically, this style of communication could cause a deadlock
+ // here. If the stdout buffer fills up, then writes to stdin could
+ // block. This doesn't appear to happen with tidy, because tidy only
+ // writes to stdout after it's finished reading from stdin. Search
+ // for tidyParseStdin and tidySaveStdout in console/tidy.c
+ fwrite( $pipes[0], $text );
+ fclose( $pipes[0] );
+ while ( !feof( $pipes[$readpipe] ) ) {
+ $cleansource .= fgets( $pipes[$readpipe], 1024 );
+ }
+ fclose( $pipes[$readpipe] );
+ $retval = proc_close( $process );
+ } else {
+ wfWarn( "Unable to start external tidy process" );
+ $retval = -1;
+ }
+
+ if ( !$stderr && $cleansource == '' && $text != '' ) {
+ // Some kind of error happened, so we couldn't get the corrected text.
+ // Just give up; we'll use the source text and append a warning.
+ $cleansource = null;
+ }
+
+ return $cleansource;
+ }
+
+ public function supportsValidate() {
+ return true;
+ }
+}
--- /dev/null
+<?php
+
+namespace MediaWiki\Tidy;
+
+class RaggettInternalHHVM extends RaggettBase {
+ /**
+ * Use the HTML tidy extension to use the tidy library in-process,
+ * saving the overhead of spawning a new process.
+ *
+ * @param string $text HTML to check
+ * @param bool $stderr Whether to read result from error status instead of output
+ * @param int &$retval Exit code (-1 on internal error)
+ * @return string|null
+ */
+ protected function cleanWrapped( $text, $stderr = false, &$retval ) {
+ if ( $stderr ) {
+ throw new Exception( "\$stderr cannot be used with RaggettInternalHHVM" );
+ }
+ $cleansource = tidy_repair_string( $text, $this->config['tidyConfigFile'], 'utf8' );
+ if ( $cleansource === false ) {
+ $cleansource = null;
+ $retval = -1;
+ } else {
+ $retval = 0;
+ }
+
+ return $cleansource;
+ }
+}
--- /dev/null
+<?php
+
+namespace MediaWiki\Tidy;
+
+class RaggettInternalPHP extends RaggettBase {
+ /**
+ * Use the HTML tidy extension to use the tidy library in-process,
+ * saving the overhead of spawning a new process.
+ *
+ * @param string $text HTML to check
+ * @param bool $stderr Whether to read result from error status instead of output
+ * @param int &$retval Exit code (-1 on internal error)
+ * @return string|null
+ */
+ protected function cleanWrapped( $text, $stderr = false, &$retval = null ) {
+ if ( !class_exists( 'tidy' ) ) {
+ wfWarn( "Unable to load internal tidy class." );
+ $retval = -1;
+
+ return null;
+ }
+
+ $tidy = new \tidy;
+ $tidy->parseString( $text, $this->config['tidyConfigFile'], 'utf8' );
+
+ if ( $stderr ) {
+ $retval = $tidy->getStatus();
+ return $tidy->errorBuffer;
+ }
+
+ $tidy->cleanRepair();
+ $retval = $tidy->getStatus();
+ if ( $retval == 2 ) {
+ // 2 is magic number for fatal error
+ // http://www.php.net/manual/en/function.tidy-get-status.php
+ $cleansource = null;
+ } else {
+ $cleansource = tidy_get_output( $tidy );
+ if ( !empty( $this->config['debugComment'] ) && $retval > 0 ) {
+ $cleansource .= "<!--\nTidy reports:\n" .
+ str_replace( '-->', '-->', $tidy->errorBuffer ) .
+ "\n-->";
+ }
+ }
+
+ return $cleansource;
+ }
+
+ public function supportsValidate() {
+ return true;
+ }
+}
--- /dev/null
+<?php
+namespace MediaWiki\Tidy;
+
+use ReplacementArray;
+use ParserOutput;
+use Parser;
+
+/**
+ * Class used to hide mw:editsection tokens from Tidy so that it doesn't break them
+ * or break on them. This is a bit of a hack for now, but hopefully in the future
+ * we may create a real postprocessor or something that will replace this.
+ * It's called wrapper because for now it basically takes over MWTidy::tidy's task
+ * of wrapping the text in a xhtml block
+ *
+ * This re-uses some of the parser's UNIQ tricks, though some of it is private so it's
+ * duplicated. Perhaps we should create an abstract marker hiding class.
+ *
+ * @ingroup Parser
+ */
+class RaggettWrapper {
+
+ /**
+ * @var ReplacementArray
+ */
+ protected $mTokens;
+
+ protected $mMarkerIndex;
+
+ public function __construct() {
+ $this->mTokens = null;
+ }
+
+ /**
+ * @param string $text
+ * @return string
+ */
+ public function getWrapped( $text ) {
+ $this->mTokens = new ReplacementArray;
+ $this->mMarkerIndex = 0;
+
+ // Replace <mw:editsection> elements with placeholders
+ $wrappedtext = preg_replace_callback( ParserOutput::EDITSECTION_REGEX,
+ array( &$this, 'replaceCallback' ), $text );
+ // ...and <mw:toc> markers
+ $wrappedtext = preg_replace_callback( '/\<\\/?mw:toc\>/',
+ array( &$this, 'replaceCallback' ), $wrappedtext );
+ // ... and <math> tags
+ $wrappedtext = preg_replace_callback( '/\<math(.*?)\<\\/math\>/s',
+ array( &$this, 'replaceCallback' ), $wrappedtext );
+ // Modify inline Microdata <link> and <meta> elements so they say <html-link> and <html-meta> so
+ // we can trick Tidy into not stripping them out by including them in tidy's new-empty-tags config
+ $wrappedtext = preg_replace( '!<(link|meta)([^>]*?)(/{0,1}>)!', '<html-$1$2$3', $wrappedtext );
+
+ // Wrap the whole thing in a doctype and body for Tidy.
+ $wrappedtext = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"' .
+ ' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html>' .
+ '<head><title>test</title></head><body>' . $wrappedtext . '</body></html>';
+
+ return $wrappedtext;
+ }
+
+ /**
+ * @param array $m
+ *
+ * @return string
+ */
+ public function replaceCallback( $m ) {
+ $marker = Parser::MARKER_PREFIX . "-item-{$this->mMarkerIndex}" . Parser::MARKER_SUFFIX;
+ $this->mMarkerIndex++;
+ $this->mTokens->setPair( $marker, $m[0] );
+ return $marker;
+ }
+
+ /**
+ * @param string $text
+ * @return string
+ */
+ public function postprocess( $text ) {
+ // Revert <html-{link,meta}> back to <{link,meta}>
+ $text = preg_replace( '!<html-(link|meta)([^>]*?)(/{0,1}>)!', '<$1$2$3', $text );
+
+ // Restore the contents of placeholder tokens
+ $text = $this->mTokens->replace( $text );
+
+ return $text;
+ }
+
+}
+?>
--- /dev/null
+<?php
+
+namespace MediaWiki\Tidy;
+
+/**
+ * Base class for HTML cleanup utilities
+ */
+abstract class TidyDriverBase {
+ protected $config;
+
+ function __construct( $config ) {
+ $this->config = $config;
+ }
+
+ /**
+ * Return true if validate() can be used
+ */
+ public function supportsValidate() {
+ return false;
+ }
+
+ /**
+ * Check HTML for errors, used if $wgValidateAllHtml = true.
+ *
+ * @param string $text
+ * @param string &$errorStr Return the error string
+ * @return bool Whether the HTML is valid
+ */
+ public function validate( $text, &$errorStr ) {
+ throw new MWException( get_class( $this ) . " does not support validate()" );
+ }
+
+ /**
+ * Clean up HTML
+ *
+ * @param string HTML document fragment to clean up
+ * @param string The corrected HTML output
+ */
+ public abstract function tidy( $text );
+}
--- /dev/null
+# html tidy (http://tidy.sf.net) configuration
+# tidy - validate, correct, and pretty-print HTML files
+# see: man 1 tidy, http://tidy.sourceforge.net/docs/quickref.html
+
+show-body-only: yes
+force-output: yes
+tidy-mark: no
+wrap: 0
+wrap-attributes: no
+literal-attributes: yes
+output-xhtml: yes
+numeric-entities: yes
+enclose-text: yes
+enclose-block-text: yes
+quiet: yes
+quote-nbsp: yes
+fix-backslash: no
+fix-uri: no
+# Don't strip html5 elements we support
+# html-{meta,link} is a hack we use to prevent Tidy from stripping <meta> and <link> used in the body for Microdata
+new-empty-tags: html-meta, html-link, wbr
+new-inline-tags: video, audio, source, track, bdi, data, time, mark
private function doRefreshLinks( $start, $newOnly = false,
$end = null, $redirectsOnly = false, $oldRedirectsOnly = false
) {
- global $wgParser, $wgUseTidy;
+ global $wgParser;
$reportingInterval = 100;
$dbr = wfGetDB( DB_SLAVE );
# Don't generate extension images (e.g. Timeline)
$wgParser->clearTagHooks();
- # Don't use HTML tidy
- $wgUseTidy = false;
-
$what = $redirectsOnly ? "redirects" : "links";
if ( $oldRedirectsOnly ) {
'wgDisableTitleConversion' => false,
// Tidy options.
'wgUseTidy' => isset( $opts['tidy'] ),
- 'wgAlwaysUseTidy' => false,
+ 'wgTidyConfig' => null,
'wgDebugTidy' => false,
- 'wgTidyConf' => $IP . '/includes/tidy.conf',
+ 'wgTidyConf' => $IP . '/includes/tidy/tidy.conf',
'wgTidyOpts' => '',
'wgTidyInternal' => $this->tidySupport->isInternal(),
);
$wgHooks['ParserGetVariableValueTs'][] = 'ParserTest::getFakeTimestamp';
MagicWord::clearCache();
+ MWTidy::destroySingleton();
return $context;
}
FileBackendGroup::destroySingleton();
LockManagerGroup::destroySingletons();
LinkCache::singleton()->clear();
+ MWTidy::destroySingleton();
foreach ( $this->savedGlobals as $var => $val ) {
$GLOBALS[$var] = $val;
// of tidy. In that case however, we can not reliably detect whether a failing validation
// is due to malformed HTML, or caused by tidy not being installed as a command line tool.
// That would cause all HTML assertions to fail on a system that has no tidy installed.
- if ( !$GLOBALS['wgTidyInternal'] ) {
+ if ( !$GLOBALS['wgTidyInternal'] || !MWTidy::isEnabled() ) {
$this->markTestSkipped( 'Tidy extension not installed' );
}
'wgContLang' => $contLang,
'wgLang' => Language::factory( 'en' ),
'wgMemc' => new EmptyBagOStuff,
- 'wgAlwaysUseTidy' => false,
'wgCleanSignatures' => true,
) );
*/
class SanitizerTest extends MediaWikiTestCase {
+ protected function tearDown() {
+ MWTidy::destroySingleton();
+ parent::tearDown();
+ }
+
/**
* @covers Sanitizer::decodeCharReferences
*/
* @param bool $escaped Whether sanitizer let the tag in or escape it (ie: '<video>')
*/
public function testRemovehtmltagsOnHtml5Tags( $tag, $escaped ) {
- $this->setMwGlobals( array(
- 'wgUseTidy' => false
- ) );
+ MWTidy::setInstance( false );
if ( $escaped ) {
$this->assertEquals( "<$tag>",
* @covers Sanitizer::removeHTMLtags
*/
public function testRemoveHTMLtags( $input, $output, $msg = null ) {
- $GLOBALS['wgUseTidy'] = false;
+ MWTidy::setInstance( false );
$this->assertEquals( $output, Sanitizer::removeHTMLtags( $input ), $msg );
}
array( '<script>foo</script>', '<script>foo</script>' ),
);
}
-
}
* @group ContentHandler
*/
class TextContentHandlerTest extends MediaWikiLangTestCase {
-
public function testSupportsDirectEditing() {
$handler = new TextContentHandler();
$this->assertTrue( $handler->supportsDirectEditing(), 'direct editing is supported' );
CONTENT_MODEL_JAVASCRIPT,
),
'wgUseTidy' => false,
- 'wgAlwaysUseTidy' => false,
'wgCapitalLinks' => true,
'wgHooks' => array(), // bypass hook ContentGetParserOutput that force custom rendering
) );
+
+ MWTidy::destroySingleton();
+ }
+
+ protected function tearDown() {
+ MWTidy::destroySingleton();
+ parent::tearDown();
}
public function newContent( $text ) {
$this->djVuSupport = new DjVuSupport();
// Tidy support
$this->tidySupport = new TidySupport();
+ $tmpGlobals['wgTidyConfig'] = null;
$tmpGlobals['wgUseTidy'] = false;
- $tmpGlobals['wgAlwaysUseTidy'] = false;
$tmpGlobals['wgDebugTidy'] = false;
- $tmpGlobals['wgTidyConf'] = $IP . '/includes/tidy.conf';
+ $tmpGlobals['wgTidyConf'] = $IP . '/includes/tidy/tidy.conf';
$tmpGlobals['wgTidyOpts'] = '';
$tmpGlobals['wgTidyInternal'] = $this->tidySupport->isInternal();
$wgNamespaceAliases['Image'] = $this->savedWeirdGlobals['image_alias'];
$wgNamespaceAliases['Image_talk'] = $this->savedWeirdGlobals['image_talk_alias'];
+ MWTidy::destroySingleton();
+
// Restore backends
RepoGroup::destroySingleton();
FileBackendGroup::destroySingleton();
$GLOBALS[$var] = $val;
}
+ MWTidy::destroySingleton();
MagicWord::clearCache();
# The entries saved into RepoGroup cache with previous globals will be wrong.
return array( array( "foo<bar" ), array( "foo>bar" ), array( "foo\nbar" ), array( "foo\rbar" ) );
}
- protected function setUp() {
- parent::setUp();
-
- $this->setMwGlobals( 'wgAlwaysUseTidy', false );
- }
-
/**
* @dataProvider provideValidNames
* @covers Parser::setHook
protected function setUp() {
parent::setUp();
- $check = MWTidy::tidy( '' );
- if ( strpos( $check, '<!--' ) !== false ) {
+ if ( !MWTidy::isEnabled() ) {
$this->markTestSkipped( 'Tidy not found' );
}
}