From 2c6c954e23ec28345655ee41568d3a3f386a659f Mon Sep 17 00:00:00 2001 From: Tim Starling Date: Mon, 31 Aug 2015 14:42:55 +1000 Subject: [PATCH] Abstract and refactor Tidy support * Split tidy implementations into a class hierarchy * Bring all tidy configuration into a single associative array and deprecate the old configuration. * Remove $wgAlwaysUseTidy This is preparatory to replacement of Tidy (T89331). I used the name "Raggett" for things relating to Dave Raggett's Tidy, since if we use "tidy" to mean the new abstract system as well as Raggett's tidy, it gets confusing. Change-Id: I77af1a16cbbb47fc226d05fb9aad56c58e8910b5 --- RELEASE-NOTES-1.26 | 1 + autoload.php | 7 +- includes/DefaultSettings.php | 53 +-- includes/Sanitizer.php | 4 +- includes/parser/MWTidy.php | 312 ++++-------------- includes/parser/Parser.php | 4 +- includes/tidy/RaggettBase.php | 47 +++ includes/tidy/RaggettExternal.php | 73 ++++ includes/tidy/RaggettInternalHHVM.php | 29 ++ includes/tidy/RaggettInternalPHP.php | 52 +++ includes/tidy/RaggettWrapper.php | 89 +++++ includes/tidy/TidyDriverBase.php | 40 +++ includes/{ => tidy}/tidy.conf | 0 maintenance/refreshLinks.php | 5 +- tests/parser/parserTest.inc | 6 +- tests/phpunit/MediaWikiTestCase.php | 2 +- tests/phpunit/includes/ExtraParserTest.php | 1 - tests/phpunit/includes/SanitizerTest.php | 12 +- .../content/TextContentHandlerTest.php | 1 - .../includes/content/TextContentTest.php | 8 +- .../phpunit/includes/parser/NewParserTest.php | 7 +- .../phpunit/includes/parser/TagHooksTest.php | 6 - tests/phpunit/includes/parser/TidyTest.php | 3 +- 23 files changed, 461 insertions(+), 301 deletions(-) create mode 100644 includes/tidy/RaggettBase.php create mode 100644 includes/tidy/RaggettExternal.php create mode 100644 includes/tidy/RaggettInternalHHVM.php create mode 100644 includes/tidy/RaggettInternalPHP.php create mode 100644 includes/tidy/RaggettWrapper.php create mode 100644 includes/tidy/TidyDriverBase.php rename includes/{ => tidy}/tidy.conf (100%) diff --git a/RELEASE-NOTES-1.26 b/RELEASE-NOTES-1.26 index b5d9d5a8a8..8f5db0ee33 100644 --- a/RELEASE-NOTES-1.26 +++ b/RELEASE-NOTES-1.26 @@ -29,6 +29,7 @@ production. * Fields in ParserOptions are now private. Use the accessors instead. * Custom LESS functions (defined via $wgResourceLoaderLESSFunctions) have been removed, after being deprecated in 1.24. +* $wgAlwaysUseTidy has been removed. === New features in 1.26 === * (T51506) Now action=info gives estimates of actual watchers for a page. diff --git a/autoload.php b/autoload.php index 82a45b49d5..62d6f09cf8 100644 --- a/autoload.php +++ b/autoload.php @@ -724,7 +724,6 @@ $wgAutoloadLocalClasses = array( 'MWOldPassword' => __DIR__ . '/includes/password/MWOldPassword.php', 'MWSaltedPassword' => __DIR__ . '/includes/password/MWSaltedPassword.php', 'MWTidy' => __DIR__ . '/includes/parser/MWTidy.php', - 'MWTidyWrapper' => __DIR__ . '/includes/parser/MWTidy.php', 'MWTimestamp' => __DIR__ . '/includes/MWTimestamp.php', 'MachineReadableRCFeedFormatter' => __DIR__ . '/includes/rcfeed/MachineReadableRCFeedFormatter.php', 'MagicWord' => __DIR__ . '/includes/MagicWord.php', @@ -761,6 +760,12 @@ $wgAutoloadLocalClasses = array( 'MediaWiki\\Logger\\Monolog\\WikiProcessor' => __DIR__ . '/includes/debug/logger/monolog/WikiProcessor.php', 'MediaWiki\\Logger\\NullSpi' => __DIR__ . '/includes/debug/logger/NullSpi.php', 'MediaWiki\\Logger\\Spi' => __DIR__ . '/includes/debug/logger/Spi.php', + 'MediaWiki\\Tidy\\RaggettBase' => __DIR__ . '/includes/tidy/RaggettBase.php', + 'MediaWiki\\Tidy\\RaggettExternal' => __DIR__ . '/includes/tidy/RaggettExternal.php', + 'MediaWiki\\Tidy\\RaggettInternalHHVM' => __DIR__ . '/includes/tidy/RaggettInternalHHVM.php', + 'MediaWiki\\Tidy\\RaggettInternalPHP' => __DIR__ . '/includes/tidy/RaggettInternalPHP.php', + 'MediaWiki\\Tidy\\RaggettWrapper' => __DIR__ . '/includes/tidy/RaggettWrapper.php', + 'MediaWiki\\Tidy\\TidyDriverBase' => __DIR__ . '/includes/tidy/TidyDriverBase.php', 'MediaWiki\\Widget\\ComplexNamespaceInputWidget' => __DIR__ . '/includes/widget/ComplexNamespaceInputWidget.php', 'MediaWiki\\Widget\\NamespaceInputWidget' => __DIR__ . '/includes/widget/NamespaceInputWidget.php', 'MediaWiki\\Widget\\TitleInputWidget' => __DIR__ . '/includes/widget/TitleInputWidget.php', diff --git a/includes/DefaultSettings.php b/includes/DefaultSettings.php index 06cd55a4ce..89c04f9c5b 100644 --- a/includes/DefaultSettings.php +++ b/includes/DefaultSettings.php @@ -4104,44 +4104,55 @@ $wgEnableImageWhitelist = true; $wgAllowImageTag = false; /** - * $wgUseTidy: use tidy to make sure HTML output is sane. - * Tidy is a free tool that fixes broken HTML. - * See http://www.w3.org/People/Raggett/tidy/ - * - * - $wgTidyBin should be set to the path of the binary and - * - $wgTidyConf to the path of the configuration file. - * - $wgTidyOpts can include any number of parameters. - * - $wgTidyInternal controls the use of the PECL extension or the - * libtidy (PHP >= 5) extension to use an in-process tidy library instead - * of spawning a separate program. - * Normally you shouldn't need to override the setting except for - * debugging. To install, use 'pear install tidy' and add a line - * 'extension=tidy.so' to php.ini. + * Configuration for HTML postprocessing tool. Set this to a configuration + * array to enable an external tool. Dave Raggett's "HTML Tidy" is typically + * used. See http://www.w3.org/People/Raggett/tidy/ + * + * If this is null and $wgUseTidy is true, the deprecated configuration + * parameters will be used instead. + * + * If this is null and $wgUseTidy is false, a pure PHP fallback will be used. + * + * Keys are: + * - driver: May be: + * - RaggettInternalHHVM: Use the limited-functionality HHVM extension + * - RaggettInternalPHP: Use the PECL extension + * - RaggettExternal: Shell out to an external binary (tidyBin) + * + * - tidyConfigFile: Path to configuration file for any of the Raggett drivers + * - debugComment: True to add a comment to the output with warning messages + * - tidyBin: For RaggettExternal, the path to the tidy binary. + * - tidyCommandLine: For RaggettExternal, additional command line options. */ -$wgUseTidy = false; +$wgTidyConfig = null; /** - * @see $wgUseTidy + * Set this to true to use the deprecated tidy configuration parameters. + * @deprecated use $wgTidyConfig */ -$wgAlwaysUseTidy = false; +$wgUseTidy = false; /** - * @see $wgUseTidy + * The path to the tidy binary. + * @deprecated Use $wgTidyConfig['tidyBin'] */ $wgTidyBin = 'tidy'; /** - * @see $wgUseTidy + * The path to the tidy config file + * @deprecated Use $wgTidyConfig['tidyConfigFile'] */ -$wgTidyConf = $IP . '/includes/tidy.conf'; +$wgTidyConf = $IP . '/includes/tidy/tidy.conf'; /** - * @see $wgUseTidy + * The command line options to the tidy binary + * @deprecated Use $wgTidyConfig['tidyCommandLine'] */ $wgTidyOpts = ''; /** - * @see $wgUseTidy + * Set this to true to use the tidy extension + * @deprecated Use $wgTidyConfig['driver'] */ $wgTidyInternal = extension_loaded( 'tidy' ); diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php index 8179905640..de63af7979 100644 --- a/includes/Sanitizer.php +++ b/includes/Sanitizer.php @@ -454,15 +454,13 @@ class Sanitizer { public static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) { - global $wgUseTidy; - extract( self::getRecognizedTagData( $extratags, $removetags ) ); # Remove HTML comments $text = Sanitizer::removeHTMLcomments( $text ); $bits = explode( '<', $text ); $text = str_replace( '>', '>', array_shift( $bits ) ); - if ( !$wgUseTidy ) { + if ( !MWTidy::isEnabled() ) { $tagstack = $tablestack = array(); foreach ( $bits as $x ) { $regs = array(); diff --git a/includes/parser/MWTidy.php b/includes/parser/MWTidy.php index e29ee88f6d..d0e50bc707 100644 --- a/includes/parser/MWTidy.php +++ b/includes/parser/MWTidy.php @@ -21,88 +21,6 @@ * @ingroup Parser */ -/** - * Class used to hide mw:editsection tokens from Tidy so that it doesn't break them - * or break on them. This is a bit of a hack for now, but hopefully in the future - * we may create a real postprocessor or something that will replace this. - * It's called wrapper because for now it basically takes over MWTidy::tidy's task - * of wrapping the text in a xhtml block - * - * This re-uses some of the parser's UNIQ tricks, though some of it is private so it's - * duplicated. Perhaps we should create an abstract marker hiding class. - * - * @ingroup Parser - */ -class MWTidyWrapper { - - /** - * @var ReplacementArray - */ - protected $mTokens; - - protected $mMarkerIndex; - - public function __construct() { - $this->mTokens = null; - } - - /** - * @param string $text - * @return string - */ - public function getWrapped( $text ) { - $this->mTokens = new ReplacementArray; - $this->mMarkerIndex = 0; - - // Replace elements with placeholders - $wrappedtext = preg_replace_callback( ParserOutput::EDITSECTION_REGEX, - array( &$this, 'replaceCallback' ), $text ); - // ...and markers - $wrappedtext = preg_replace_callback( '/\<\\/?mw:toc\>/', - array( &$this, 'replaceCallback' ), $wrappedtext ); - // ... and tags - $wrappedtext = preg_replace_callback( '/\/s', - array( &$this, 'replaceCallback' ), $wrappedtext ); - // Modify inline Microdata and elements so they say and so - // we can trick Tidy into not stripping them out by including them in tidy's new-empty-tags config - $wrappedtext = preg_replace( '!<(link|meta)([^>]*?)(/{0,1}>)!', '' . - 'test' . $wrappedtext . ''; - - return $wrappedtext; - } - - /** - * @param array $m - * - * @return string - */ - public function replaceCallback( $m ) { - $marker = Parser::MARKER_PREFIX . "-item-{$this->mMarkerIndex}" . Parser::MARKER_SUFFIX; - $this->mMarkerIndex++; - $this->mTokens->setPair( $marker, $m[0] ); - return $marker; - } - - /** - * @param string $text - * @return string - */ - public function postprocess( $text ) { - // Revert back to <{link,meta}> - $text = preg_replace( '!]*?)(/{0,1}>)!', '<$1$2$3', $text ); - - // Restore the contents of placeholder tokens - $text = $this->mTokens->replace( $text ); - - return $text; - } - -} - /** * Class to interact with HTML tidy * @@ -113,8 +31,10 @@ class MWTidyWrapper { * @ingroup Parser */ class MWTidy { + private static $instance; + /** - * Interface with html tidy, used if $wgUseTidy = true. + * Interface with html tidy. * If tidy isn't able to correct the markup, the original will be * returned in all its glory with a warning comment appended. * @@ -122,23 +42,12 @@ class MWTidy { * @return string Corrected HTML output */ public static function tidy( $text ) { - $wrapper = new MWTidyWrapper; - $wrappedtext = $wrapper->getWrapped( $text ); - - $retVal = null; - $correctedtext = self::clean( $wrappedtext, false, $retVal ); - - if ( $retVal < 0 ) { - wfDebug( "Possible tidy configuration error!\n" ); - return $text . "\n\n"; - } elseif ( is_null( $correctedtext ) ) { - wfDebug( "Tidy error detected!\n" ); - return $text . "\n\n"; + $driver = self::singleton(); + if ( !$driver ) { + throw new MWException( __METHOD__. + ': tidy is disabled, caller should have checked MWTidy::isEnabled()' ); } - - $correctedtext = $wrapper->postprocess( $correctedtext ); // restore any hidden tokens - - return $correctedtext; + return $driver->tidy( $text ); } /** @@ -149,170 +58,77 @@ class MWTidy { * @return bool Whether the HTML is valid */ public static function checkErrors( $text, &$errorStr = null ) { - $retval = 0; - $errorStr = self::clean( $text, true, $retval ); - return ( $retval < 0 && $errorStr == '' ) || $retval == 0; + $driver = self::singleton(); + if ( !$driver ) { + throw new MWException( __METHOD__. + ': tidy is disabled, caller should have checked MWTidy::isEnabled()' ); + } + if ( $driver->supportsValidate() ) { + return $driver->validate( $text, $errorStr ); + } else { + throw new MWException( __METHOD__ . ": error text return from HHVM tidy is not supported" ); + } } - /** - * Perform a clean/repair operation - * @param string $text HTML to check - * @param bool $stderr Whether to read result from STDERR rather than STDOUT - * @param int &$retval Exit code (-1 on internal error) - * @return null|string - * @throws MWException - */ - private static function clean( $text, $stderr = false, &$retval = null ) { - global $wgTidyInternal; + public static function isEnabled() { + return self::singleton() !== false; + } - if ( $wgTidyInternal ) { - if ( wfIsHHVM() ) { - if ( $stderr ) { - throw new MWException( __METHOD__ . ": error text return from HHVM tidy is not supported" ); + protected static function singleton() { + global $wgUseTidy, $wgTidyInternal, $wgTidyConf, $wgDebugTidy, $wgTidyConfig, + $wgTidyBin, $wgTidyOpts; + + if ( self::$instance === null ) { + if ( $wgTidyConfig !== null ) { + $config = $wgTidyConfig; + } elseif ( $wgUseTidy ) { + // b/c configuration + $config = array( + 'tidyConfigFile' => $wgTidyConf, + 'debugComment' => $wgDebugTidy, + 'tidyBin' => $wgTidyBin, + 'tidyCommandLine' => $wgTidyOpts ); + if ( $wgTidyInternal ) { + if ( wfIsHHVM() ) { + $config['driver'] = 'RaggettInternalHHVM'; + } else { + $config['driver'] = 'RaggettInternalPHP'; + } + } else { + $config['driver'] = 'RaggettExternal'; } - return self::hhvmClean( $text, $retval ); } else { - return self::phpClean( $text, $stderr, $retval ); + return false; } - } else { - return self::externalClean( $text, $stderr, $retval ); - } - } - - /** - * Spawn an external HTML tidy process and get corrected markup back from it. - * Also called in OutputHandler.php for full page validation - * - * @param string $text HTML to check - * @param bool $stderr Whether to read result from STDERR rather than STDOUT - * @param int &$retval Exit code (-1 on internal error) - * @return string|null - */ - private static function externalClean( $text, $stderr = false, &$retval = null ) { - global $wgTidyConf, $wgTidyBin, $wgTidyOpts; - - $cleansource = ''; - $opts = ' -utf8'; - - if ( $stderr ) { - $descriptorspec = array( - 0 => array( 'pipe', 'r' ), - 1 => array( 'file', wfGetNull(), 'a' ), - 2 => array( 'pipe', 'w' ) - ); - } else { - $descriptorspec = array( - 0 => array( 'pipe', 'r' ), - 1 => array( 'pipe', 'w' ), - 2 => array( 'file', wfGetNull(), 'a' ) - ); - } - - $readpipe = $stderr ? 2 : 1; - $pipes = array(); - - $process = proc_open( - "$wgTidyBin -config $wgTidyConf $wgTidyOpts$opts", $descriptorspec, $pipes ); - - //NOTE: At least on linux, the process will be created even if tidy is not installed. - // This means that missing tidy will be treated as a validation failure. - - if ( is_resource( $process ) ) { - // Theoretically, this style of communication could cause a deadlock - // here. If the stdout buffer fills up, then writes to stdin could - // block. This doesn't appear to happen with tidy, because tidy only - // writes to stdout after it's finished reading from stdin. Search - // for tidyParseStdin and tidySaveStdout in console/tidy.c - fwrite( $pipes[0], $text ); - fclose( $pipes[0] ); - while ( !feof( $pipes[$readpipe] ) ) { - $cleansource .= fgets( $pipes[$readpipe], 1024 ); + switch ( $config['driver'] ) { + case 'RaggettInternalHHVM': + self::$instance = new MediaWiki\Tidy\RaggettInternalHHVM( $config ); + break; + case 'RaggettInternalPHP': + self::$instance = new MediaWiki\Tidy\RaggettInternalPHP( $config ); + break; + case 'RaggettExternal': + self::$instance = new MediaWiki\Tidy\RaggettExternal( $config ); + break; + default: + throw new MWException( "Invalid tidy driver: \"{$config['driver']}\"" ); } - fclose( $pipes[$readpipe] ); - $retval = proc_close( $process ); - } else { - wfWarn( "Unable to start external tidy process" ); - $retval = -1; } - - if ( !$stderr && $cleansource == '' && $text != '' ) { - // Some kind of error happened, so we couldn't get the corrected text. - // Just give up; we'll use the source text and append a warning. - $cleansource = null; - } - - return $cleansource; + return self::$instance; } /** - * Use the HTML tidy extension to use the tidy library in-process, - * saving the overhead of spawning a new process. - * - * @param string $text HTML to check - * @param bool $stderr Whether to read result from error status instead of output - * @param int &$retval Exit code (-1 on internal error) - * @return string|null + * Set the driver to be used. This is for testing. + * @param TidyDriverBase|false|null $instance */ - private static function phpClean( $text, $stderr = false, &$retval = null ) { - global $wgTidyConf, $wgDebugTidy; - - if ( ( !wfIsHHVM() && !class_exists( 'tidy' ) ) || - ( wfIsHHVM() && !function_exists( 'tidy_repair_string' ) ) - ) { - wfWarn( "Unable to load internal tidy class." ); - $retval = -1; - - return null; - } - - $tidy = new tidy; - $tidy->parseString( $text, $wgTidyConf, 'utf8' ); - - if ( $stderr ) { - $retval = $tidy->getStatus(); - return $tidy->errorBuffer; - } - - $tidy->cleanRepair(); - $retval = $tidy->getStatus(); - if ( $retval == 2 ) { - // 2 is magic number for fatal error - // http://www.php.net/manual/en/function.tidy-get-status.php - $cleansource = null; - } else { - $cleansource = tidy_get_output( $tidy ); - if ( $wgDebugTidy && $retval > 0 ) { - $cleansource .= "', '-->', $tidy->errorBuffer ) . - "\n-->"; - } - } - - return $cleansource; + public static function setInstance( $instance ) { + self::$instance = $instance; } /** - * Use the tidy extension for HHVM from - * https://github.com/wikimedia/mediawiki-php-tidy - * - * This currently does not support the object-oriented interface, but - * tidy_repair_string() can be used for the most common tasks. - * - * @param string $text HTML to check - * @param int &$retval Exit code (-1 on internal error) - * @return string|null + * Destroy the current singleton instance */ - private static function hhvmClean( $text, &$retval ) { - global $wgTidyConf; - - $cleansource = tidy_repair_string( $text, $wgTidyConf, 'utf8' ); - if ( $cleansource === false ) { - $cleansource = null; - $retval = -1; - } else { - $retval = 0; - } - - return $cleansource; + public static function destroySingleton() { + self::$instance = null; } } diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php index ab5c58695c..677da63bd7 100644 --- a/includes/parser/Parser.php +++ b/includes/parser/Parser.php @@ -1283,8 +1283,6 @@ class Parser { * @return string */ private function internalParseHalfParsed( $text, $isMain = true, $linestart = true ) { - global $wgUseTidy, $wgAlwaysUseTidy; - $text = $this->mStripState->unstripGeneral( $text ); if ( $isMain ) { @@ -1335,7 +1333,7 @@ class Parser { $text = Sanitizer::normalizeCharReferences( $text ); - if ( ( $wgUseTidy && $this->mOptions->getTidy() ) || $wgAlwaysUseTidy ) { + if ( MWTidy::isEnabled() && $this->mOptions->getTidy() ) { $text = MWTidy::tidy( $text ); } else { # attempt to sanitize at least some nesting problems diff --git a/includes/tidy/RaggettBase.php b/includes/tidy/RaggettBase.php new file mode 100644 index 0000000000..a3717b2bf0 --- /dev/null +++ b/includes/tidy/RaggettBase.php @@ -0,0 +1,47 @@ +getWrapped( $text ); + + $retVal = null; + $correctedtext = $this->cleanWrapped( $wrappedtext, false, $retVal ); + + if ( $retVal < 0 ) { + wfDebug( "Possible tidy configuration error!\n" ); + return $text . "\n\n"; + } elseif ( is_null( $correctedtext ) ) { + wfDebug( "Tidy error detected!\n" ); + return $text . "\n\n"; + } + + $correctedtext = $wrapper->postprocess( $correctedtext ); // restore any hidden tokens + + return $correctedtext; + } + + public function validate( $text, &$errorStr ) { + $retval = 0; + $errorStr = $this->cleanWrapped( $text, true, $retval ); + return ( $retval < 0 && $errorStr == '' ) || $retval == 0; + } + + /** + * Perform a clean/repair operation + * @param string $text HTML to check + * @param bool $stderr Whether to read result from STDERR rather than STDOUT + * @param int &$retval Exit code (-1 on internal error) + * @return null|string + * @throws MWException + */ + abstract protected function cleanWrapped( $text, $stderr = false, &$retval = null ); +} diff --git a/includes/tidy/RaggettExternal.php b/includes/tidy/RaggettExternal.php new file mode 100644 index 0000000000..1193318891 --- /dev/null +++ b/includes/tidy/RaggettExternal.php @@ -0,0 +1,73 @@ + array( 'pipe', 'r' ), + 1 => array( 'file', wfGetNull(), 'a' ), + 2 => array( 'pipe', 'w' ) + ); + } else { + $descriptorspec = array( + 0 => array( 'pipe', 'r' ), + 1 => array( 'pipe', 'w' ), + 2 => array( 'file', wfGetNull(), 'a' ) + ); + } + + $readpipe = $stderr ? 2 : 1; + $pipes = array(); + + $process = proc_open( + "{$this->config['tidyBin']} -config {$this->config['tidyConfigFile']} " . + $this->config['tidyCommandLine'] . $opts, $descriptorspec, $pipes ); + + //NOTE: At least on linux, the process will be created even if tidy is not installed. + // This means that missing tidy will be treated as a validation failure. + + if ( is_resource( $process ) ) { + // Theoretically, this style of communication could cause a deadlock + // here. If the stdout buffer fills up, then writes to stdin could + // block. This doesn't appear to happen with tidy, because tidy only + // writes to stdout after it's finished reading from stdin. Search + // for tidyParseStdin and tidySaveStdout in console/tidy.c + fwrite( $pipes[0], $text ); + fclose( $pipes[0] ); + while ( !feof( $pipes[$readpipe] ) ) { + $cleansource .= fgets( $pipes[$readpipe], 1024 ); + } + fclose( $pipes[$readpipe] ); + $retval = proc_close( $process ); + } else { + wfWarn( "Unable to start external tidy process" ); + $retval = -1; + } + + if ( !$stderr && $cleansource == '' && $text != '' ) { + // Some kind of error happened, so we couldn't get the corrected text. + // Just give up; we'll use the source text and append a warning. + $cleansource = null; + } + + return $cleansource; + } + + public function supportsValidate() { + return true; + } +} diff --git a/includes/tidy/RaggettInternalHHVM.php b/includes/tidy/RaggettInternalHHVM.php new file mode 100644 index 0000000000..03860bec4b --- /dev/null +++ b/includes/tidy/RaggettInternalHHVM.php @@ -0,0 +1,29 @@ +config['tidyConfigFile'], 'utf8' ); + if ( $cleansource === false ) { + $cleansource = null; + $retval = -1; + } else { + $retval = 0; + } + + return $cleansource; + } +} diff --git a/includes/tidy/RaggettInternalPHP.php b/includes/tidy/RaggettInternalPHP.php new file mode 100644 index 0000000000..1ce14b60e1 --- /dev/null +++ b/includes/tidy/RaggettInternalPHP.php @@ -0,0 +1,52 @@ +parseString( $text, $this->config['tidyConfigFile'], 'utf8' ); + + if ( $stderr ) { + $retval = $tidy->getStatus(); + return $tidy->errorBuffer; + } + + $tidy->cleanRepair(); + $retval = $tidy->getStatus(); + if ( $retval == 2 ) { + // 2 is magic number for fatal error + // http://www.php.net/manual/en/function.tidy-get-status.php + $cleansource = null; + } else { + $cleansource = tidy_get_output( $tidy ); + if ( !empty( $this->config['debugComment'] ) && $retval > 0 ) { + $cleansource .= "', '-->', $tidy->errorBuffer ) . + "\n-->"; + } + } + + return $cleansource; + } + + public function supportsValidate() { + return true; + } +} diff --git a/includes/tidy/RaggettWrapper.php b/includes/tidy/RaggettWrapper.php new file mode 100644 index 0000000000..083f40203b --- /dev/null +++ b/includes/tidy/RaggettWrapper.php @@ -0,0 +1,89 @@ +mTokens = null; + } + + /** + * @param string $text + * @return string + */ + public function getWrapped( $text ) { + $this->mTokens = new ReplacementArray; + $this->mMarkerIndex = 0; + + // Replace elements with placeholders + $wrappedtext = preg_replace_callback( ParserOutput::EDITSECTION_REGEX, + array( &$this, 'replaceCallback' ), $text ); + // ...and markers + $wrappedtext = preg_replace_callback( '/\<\\/?mw:toc\>/', + array( &$this, 'replaceCallback' ), $wrappedtext ); + // ... and tags + $wrappedtext = preg_replace_callback( '/\/s', + array( &$this, 'replaceCallback' ), $wrappedtext ); + // Modify inline Microdata and elements so they say and so + // we can trick Tidy into not stripping them out by including them in tidy's new-empty-tags config + $wrappedtext = preg_replace( '!<(link|meta)([^>]*?)(/{0,1}>)!', '' . + 'test' . $wrappedtext . ''; + + return $wrappedtext; + } + + /** + * @param array $m + * + * @return string + */ + public function replaceCallback( $m ) { + $marker = Parser::MARKER_PREFIX . "-item-{$this->mMarkerIndex}" . Parser::MARKER_SUFFIX; + $this->mMarkerIndex++; + $this->mTokens->setPair( $marker, $m[0] ); + return $marker; + } + + /** + * @param string $text + * @return string + */ + public function postprocess( $text ) { + // Revert back to <{link,meta}> + $text = preg_replace( '!]*?)(/{0,1}>)!', '<$1$2$3', $text ); + + // Restore the contents of placeholder tokens + $text = $this->mTokens->replace( $text ); + + return $text; + } + +} +?> diff --git a/includes/tidy/TidyDriverBase.php b/includes/tidy/TidyDriverBase.php new file mode 100644 index 0000000000..1d994aa1bd --- /dev/null +++ b/includes/tidy/TidyDriverBase.php @@ -0,0 +1,40 @@ +config = $config; + } + + /** + * Return true if validate() can be used + */ + public function supportsValidate() { + return false; + } + + /** + * Check HTML for errors, used if $wgValidateAllHtml = true. + * + * @param string $text + * @param string &$errorStr Return the error string + * @return bool Whether the HTML is valid + */ + public function validate( $text, &$errorStr ) { + throw new MWException( get_class( $this ) . " does not support validate()" ); + } + + /** + * Clean up HTML + * + * @param string HTML document fragment to clean up + * @param string The corrected HTML output + */ + public abstract function tidy( $text ); +} diff --git a/includes/tidy.conf b/includes/tidy/tidy.conf similarity index 100% rename from includes/tidy.conf rename to includes/tidy/tidy.conf diff --git a/maintenance/refreshLinks.php b/maintenance/refreshLinks.php index 763d97e00d..06e1449e15 100644 --- a/maintenance/refreshLinks.php +++ b/maintenance/refreshLinks.php @@ -73,7 +73,7 @@ class RefreshLinks extends Maintenance { private function doRefreshLinks( $start, $newOnly = false, $end = null, $redirectsOnly = false, $oldRedirectsOnly = false ) { - global $wgParser, $wgUseTidy; + global $wgParser; $reportingInterval = 100; $dbr = wfGetDB( DB_SLAVE ); @@ -88,9 +88,6 @@ class RefreshLinks extends Maintenance { # Don't generate extension images (e.g. Timeline) $wgParser->clearTagHooks(); - # Don't use HTML tidy - $wgUseTidy = false; - $what = $redirectsOnly ? "redirects" : "links"; if ( $oldRedirectsOnly ) { diff --git a/tests/parser/parserTest.inc b/tests/parser/parserTest.inc index cc0df7ab06..f429c30a72 100644 --- a/tests/parser/parserTest.inc +++ b/tests/parser/parserTest.inc @@ -889,9 +889,9 @@ class ParserTest { 'wgDisableTitleConversion' => false, // Tidy options. 'wgUseTidy' => isset( $opts['tidy'] ), - 'wgAlwaysUseTidy' => false, + 'wgTidyConfig' => null, 'wgDebugTidy' => false, - 'wgTidyConf' => $IP . '/includes/tidy.conf', + 'wgTidyConf' => $IP . '/includes/tidy/tidy.conf', 'wgTidyOpts' => '', 'wgTidyInternal' => $this->tidySupport->isInternal(), ); @@ -936,6 +936,7 @@ class ParserTest { $wgHooks['ParserGetVariableValueTs'][] = 'ParserTest::getFakeTimestamp'; MagicWord::clearCache(); + MWTidy::destroySingleton(); return $context; } @@ -1218,6 +1219,7 @@ class ParserTest { FileBackendGroup::destroySingleton(); LockManagerGroup::destroySingletons(); LinkCache::singleton()->clear(); + MWTidy::destroySingleton(); foreach ( $this->savedGlobals as $var => $val ) { $GLOBALS[$var] = $val; diff --git a/tests/phpunit/MediaWikiTestCase.php b/tests/phpunit/MediaWikiTestCase.php index ac214a29ef..7dc7027afb 100644 --- a/tests/phpunit/MediaWikiTestCase.php +++ b/tests/phpunit/MediaWikiTestCase.php @@ -1117,7 +1117,7 @@ abstract class MediaWikiTestCase extends PHPUnit_Framework_TestCase { // of tidy. In that case however, we can not reliably detect whether a failing validation // is due to malformed HTML, or caused by tidy not being installed as a command line tool. // That would cause all HTML assertions to fail on a system that has no tidy installed. - if ( !$GLOBALS['wgTidyInternal'] ) { + if ( !$GLOBALS['wgTidyInternal'] || !MWTidy::isEnabled() ) { $this->markTestSkipped( 'Tidy extension not installed' ); } diff --git a/tests/phpunit/includes/ExtraParserTest.php b/tests/phpunit/includes/ExtraParserTest.php index 77b26b3bd4..7b60fb3f74 100644 --- a/tests/phpunit/includes/ExtraParserTest.php +++ b/tests/phpunit/includes/ExtraParserTest.php @@ -22,7 +22,6 @@ class ExtraParserTest extends MediaWikiTestCase { 'wgContLang' => $contLang, 'wgLang' => Language::factory( 'en' ), 'wgMemc' => new EmptyBagOStuff, - 'wgAlwaysUseTidy' => false, 'wgCleanSignatures' => true, ) ); diff --git a/tests/phpunit/includes/SanitizerTest.php b/tests/phpunit/includes/SanitizerTest.php index c615c460c9..d3dc512bea 100644 --- a/tests/phpunit/includes/SanitizerTest.php +++ b/tests/phpunit/includes/SanitizerTest.php @@ -6,6 +6,11 @@ */ class SanitizerTest extends MediaWikiTestCase { + protected function tearDown() { + MWTidy::destroySingleton(); + parent::tearDown(); + } + /** * @covers Sanitizer::decodeCharReferences */ @@ -93,9 +98,7 @@ class SanitizerTest extends MediaWikiTestCase { * @param bool $escaped Whether sanitizer let the tag in or escape it (ie: '<video>') */ public function testRemovehtmltagsOnHtml5Tags( $tag, $escaped ) { - $this->setMwGlobals( array( - 'wgUseTidy' => false - ) ); + MWTidy::setInstance( false ); if ( $escaped ) { $this->assertEquals( "<$tag>", @@ -157,7 +160,7 @@ class SanitizerTest extends MediaWikiTestCase { * @covers Sanitizer::removeHTMLtags */ public function testRemoveHTMLtags( $input, $output, $msg = null ) { - $GLOBALS['wgUseTidy'] = false; + MWTidy::setInstance( false ); $this->assertEquals( $output, Sanitizer::removeHTMLtags( $input ), $msg ); } @@ -360,5 +363,4 @@ class SanitizerTest extends MediaWikiTestCase { array( '<script>foo</script>', '' ), ); } - } diff --git a/tests/phpunit/includes/content/TextContentHandlerTest.php b/tests/phpunit/includes/content/TextContentHandlerTest.php index 33861f11e3..492fec6b68 100644 --- a/tests/phpunit/includes/content/TextContentHandlerTest.php +++ b/tests/phpunit/includes/content/TextContentHandlerTest.php @@ -4,7 +4,6 @@ * @group ContentHandler */ class TextContentHandlerTest extends MediaWikiLangTestCase { - public function testSupportsDirectEditing() { $handler = new TextContentHandler(); $this->assertTrue( $handler->supportsDirectEditing(), 'direct editing is supported' ); diff --git a/tests/phpunit/includes/content/TextContentTest.php b/tests/phpunit/includes/content/TextContentTest.php index dd61f85b4e..fe263756a0 100644 --- a/tests/phpunit/includes/content/TextContentTest.php +++ b/tests/phpunit/includes/content/TextContentTest.php @@ -27,10 +27,16 @@ class TextContentTest extends MediaWikiLangTestCase { CONTENT_MODEL_JAVASCRIPT, ), 'wgUseTidy' => false, - 'wgAlwaysUseTidy' => false, 'wgCapitalLinks' => true, 'wgHooks' => array(), // bypass hook ContentGetParserOutput that force custom rendering ) ); + + MWTidy::destroySingleton(); + } + + protected function tearDown() { + MWTidy::destroySingleton(); + parent::tearDown(); } public function newContent( $text ) { diff --git a/tests/phpunit/includes/parser/NewParserTest.php b/tests/phpunit/includes/parser/NewParserTest.php index c7a310367b..0d2129b729 100644 --- a/tests/phpunit/includes/parser/NewParserTest.php +++ b/tests/phpunit/includes/parser/NewParserTest.php @@ -160,10 +160,10 @@ class NewParserTest extends MediaWikiTestCase { $this->djVuSupport = new DjVuSupport(); // Tidy support $this->tidySupport = new TidySupport(); + $tmpGlobals['wgTidyConfig'] = null; $tmpGlobals['wgUseTidy'] = false; - $tmpGlobals['wgAlwaysUseTidy'] = false; $tmpGlobals['wgDebugTidy'] = false; - $tmpGlobals['wgTidyConf'] = $IP . '/includes/tidy.conf'; + $tmpGlobals['wgTidyConf'] = $IP . '/includes/tidy/tidy.conf'; $tmpGlobals['wgTidyOpts'] = ''; $tmpGlobals['wgTidyInternal'] = $this->tidySupport->isInternal(); @@ -185,6 +185,8 @@ class NewParserTest extends MediaWikiTestCase { $wgNamespaceAliases['Image'] = $this->savedWeirdGlobals['image_alias']; $wgNamespaceAliases['Image_talk'] = $this->savedWeirdGlobals['image_talk_alias']; + MWTidy::destroySingleton(); + // Restore backends RepoGroup::destroySingleton(); FileBackendGroup::destroySingleton(); @@ -454,6 +456,7 @@ class NewParserTest extends MediaWikiTestCase { $GLOBALS[$var] = $val; } + MWTidy::destroySingleton(); MagicWord::clearCache(); # The entries saved into RepoGroup cache with previous globals will be wrong. diff --git a/tests/phpunit/includes/parser/TagHooksTest.php b/tests/phpunit/includes/parser/TagHooksTest.php index 3605e50f14..4af389852a 100644 --- a/tests/phpunit/includes/parser/TagHooksTest.php +++ b/tests/phpunit/includes/parser/TagHooksTest.php @@ -19,12 +19,6 @@ class TagHookTest extends MediaWikiTestCase { return array( array( "foobar" ), array( "foo\nbar" ), array( "foo\rbar" ) ); } - protected function setUp() { - parent::setUp(); - - $this->setMwGlobals( 'wgAlwaysUseTidy', false ); - } - /** * @dataProvider provideValidNames * @covers Parser::setHook diff --git a/tests/phpunit/includes/parser/TidyTest.php b/tests/phpunit/includes/parser/TidyTest.php index f656a74dbf..5db2908048 100644 --- a/tests/phpunit/includes/parser/TidyTest.php +++ b/tests/phpunit/includes/parser/TidyTest.php @@ -7,8 +7,7 @@ class TidyTest extends MediaWikiTestCase { protected function setUp() { parent::setUp(); - $check = MWTidy::tidy( '' ); - if ( strpos( $check, '