From fe36327916c42b887a6bd318b6a560b7458ca79e Mon Sep 17 00:00:00 2001 From: Platonides Date: Thu, 26 May 2011 18:49:25 +0000 Subject: [PATCH] Refactor the common code of compareParsers.php and preprocessDump.php into a dumpIterator.php script. Implement a simple 'search into this dump' --- maintenance/compareParsers.php | 69 ++------------- maintenance/dumpIterator.php | 149 +++++++++++++++++++++++++++++++++ maintenance/preprocessDump.php | 74 ++-------------- 3 files changed, 164 insertions(+), 128 deletions(-) create mode 100644 maintenance/dumpIterator.php diff --git a/maintenance/compareParsers.php b/maintenance/compareParsers.php index 7e60071b6e..55358464ef 100644 --- a/maintenance/compareParsers.php +++ b/maintenance/compareParsers.php @@ -27,9 +27,9 @@ * @ingroup Maintenance */ -require_once( dirname( __FILE__ ) . '/Maintenance.php' ); +require_once( dirname( __FILE__ ) . '/dumpIterator.php' ); -class CompareParsers extends Maintenance { +class CompareParsers extends DumpIterator { private $count = 0; private $startTime; @@ -40,9 +40,6 @@ class CompareParsers extends Maintenance { $this->mDescription = "Run a file or dump with several parsers"; $this->addOption( 'parser1', 'The first parser to compare.', true, true ); $this->addOption( 'parser2', 'The second parser to compare.', true, true ); - $this->addOption( 'file', 'File with text to run.', false, true ); - $this->addOption( 'dump', 'XML dump to execute all revisions.', false, true ); - $this->addOption( 'from', 'Article from XML dump to start from.', false, true ); $this->addOption( 'tidy', 'Run tidy on the articles.', false, false ); $this->addOption( 'save-failed', 'Folder in which articles which differ will be stored.', false, true ); $this->addOption( 'show-diff', 'Show a diff of the two renderings.', false, false ); @@ -51,11 +48,7 @@ class CompareParsers extends Maintenance { $this->addOption( 'show-parsed-output', 'Show the parsed html if both Parsers give the same output.', false, false ); } - public function execute() { - if (! ( $this->hasOption('file') ^ $this->hasOption('dump') ) ) { - $this->error("You must provide file or dump", true); - } - + public function checkOptions() { if ( $this->hasOption('save-failed') ) { $this->saveFailed = $this->getOption('save-failed'); } @@ -83,41 +76,13 @@ class CompareParsers extends Maintenance { $this->options->setTidy( true ); } - if ( $this->hasOption('file') ) { - $revision = new WikiRevision; - - $revision->setText( file_get_contents( $this->getOption('file') ) ); - $revision->setTitle( Title::newFromText( rawurldecode( basename( $this->getOption('file'), '.txt' ) ) ) ); - $this->handleRevision( $revision ); - return; - } - - $this->startTime = wfTime(); - - if ( $this->getOption('dump') == '-' ) { - $source = new ImportStreamSource( $this->getStdin() ); - } else { - $this->error("Sorry, I don't support dump filenames yet. Use - and provide it on stdin on the meantime.", true); - } - $importer = new WikiImporter( $source ); - - $importer->setRevisionCallback( - array( &$this, 'handleRevision' ) ); - - $this->from = $this->getOption( 'from', null ); - $this->count = 0; $this->failed = 0; - $importer->doImport(); - + } + + public function conclusions() { $this->error( "{$this->failed} failed revisions out of {$this->count}" ); if ($this->count > 0) $this->output( " (" . ( $this->failed / $this->count ) . "%)\n" ); - - $delta = wfTime() - $this->startTime; - $this->error( "Compared {$this->count} pages in " . round($delta, 2) . " seconds " ); - if ($delta > 0) - $this->error( round($this->count / $delta, 2) . " pages/sec" ); - $this->error( "\n" ); } function stripParameters( $text ) { @@ -131,25 +96,9 @@ class CompareParsers extends Maintenance { * Callback function for each revision, parse with both parsers and compare * @param $rev Revision */ - public function handleRevision( $rev ) { + public function processRevision( $rev ) { $title = $rev->getTitle(); - if ( !$title ) { - $this->error( "Got bogus revision with null title!" ); - return; - } - - $this->count++; - if ( isset( $this->from ) ) { - if ( $this->from != $title ) - return; - $this->output( "Skipped " . ($this->count - 1) . " pages\n" ); - - $this->count = 1; - $this->from = null; - } - - - + $parser1Name = $this->getOption( 'parser1' ); $parser2Name = $this->getOption( 'parser2' ); @@ -191,4 +140,4 @@ class CompareParsers extends Maintenance { } $maintClass = "CompareParsers"; -require_once( RUN_MAINTENANCE_IF_MAIN ); +require( RUN_MAINTENANCE_IF_MAIN ); diff --git a/maintenance/dumpIterator.php b/maintenance/dumpIterator.php new file mode 100644 index 0000000000..ab28ea4469 --- /dev/null +++ b/maintenance/dumpIterator.php @@ -0,0 +1,149 @@ +mDescription = "Does something with a dump"; + $this->addOption( 'file', 'File with text to run.', false, true ); + $this->addOption( 'dump', 'XML dump to execute all revisions.', false, true ); + $this->addOption( 'from', 'Article from XML dump to start from.', false, true ); + } + + public function execute() { + if (! ( $this->hasOption('file') ^ $this->hasOption('dump') ) ) { + $this->error("You must provide a file or dump", true); + } + + $this->checkOptions(); + + if ( $this->hasOption('file') ) { + $revision = new WikiRevision; + + $revision->setText( file_get_contents( $this->getOption( 'file' ) ) ); + $revision->setTitle( Title::newFromText( rawurldecode( basename( $this->getOption( 'file' ), '.txt' ) ) ) ); + $this->handleRevision( $revision ); + return; + } + + $this->startTime = wfTime(); + + if ( $this->getOption('dump') == '-' ) { + $source = new ImportStreamSource( $this->getStdin() ); + } else { + $this->error("Sorry, I don't support dump filenames yet. Use - and provide it on stdin on the meantime.", true); + } + $importer = new WikiImporter( $source ); + + $importer->setRevisionCallback( + array( &$this, 'handleRevision' ) ); + + $this->from = $this->getOption( 'from', null ); + $this->count = 0; + $importer->doImport(); + + $this->conclusions(); + + $delta = wfTime() - $this->startTime; + $this->error( "Done {$this->count} revisions in " . round($delta, 2) . " seconds " ); + if ($delta > 0) + $this->error( round($this->count / $delta, 2) . " pages/sec" ); + + # Perform the memory_get_peak_usage() when all the other data has been output so there's no damage if it dies. + # It is only available since 5.2.0 (since 5.2.1 if you haven't compiled with --enable-memory-limit) + $this->error( "Memory peak usage of " . memory_get_peak_usage() . " bytes\n" ); + } + + function stripParameters( $text ) { + if ( !$this->stripParametersEnabled ) { + return $text; + } + return preg_replace( '/(]+>/', '$1>', $text ); + } + + /** + * Callback function for each revision, child classes should override + * processRevision instead. + * @param $rev Revision + */ + public function handleRevision( $rev ) { + $title = $rev->getTitle(); + if ( !$title ) { + $this->error( "Got bogus revision with null title!" ); + return; + } + + $this->count++; + if ( isset( $this->from ) ) { + if ( $this->from != $title ) + return; + $this->output( "Skipped " . ($this->count - 1) . " pages\n" ); + + $this->count = 1; + $this->from = null; + } + + $this->processRevision( $rev ); + } + + /* Stub function for processing additional options */ + public function checkOptions() { + return; + } + + /* Stub function for giving data about what was computed */ + public function conclusions() { + return; + } + + /* Core function which does whatever the maintenance script is designed to do */ + abstract public function processRevision( $rev ); +} + +class SearchDump extends DumpIterator { + + public function __construct() { + parent::__construct(); + $this->mDescription = "Runs a regex in the revisions from a dump"; + $this->addOption( 'regex', 'Searching regex', true, true ); + } + + public function processRevision( $rev ) { + if ( preg_match( $this->getOption( 'regex' ), $rev->getText() ) ) { + $this->output( $rev->getTitle() . " matches at edit from " . $rev->getTimestamp() . "\n" ); + } + } +} + +$maintClass = "SearchDump"; +require_once( RUN_MAINTENANCE_IF_MAIN ); diff --git a/maintenance/preprocessDump.php b/maintenance/preprocessDump.php index 27d11dcda0..a6c3839682 100644 --- a/maintenance/preprocessDump.php +++ b/maintenance/preprocessDump.php @@ -25,12 +25,9 @@ * @ingroup Maintenance */ -require_once( dirname( __FILE__ ) . '/Maintenance.php' ); +require_once( dirname( __FILE__ ) . '/dumpIterator.php' ); -class PreprocessDump extends Maintenance { - - private $count = 0; - private $startTime; +class PreprocessDump extends DumpIterator { /* Variables for dressing up as a parser */ public $mTitle = 'PreprocessDump'; @@ -43,11 +40,6 @@ class PreprocessDump extends Maintenance { public function __construct() { parent::__construct(); - $this->saveFailed = false; - $this->mDescription = "Run a file or dump with a preprocessor"; - $this->addOption( 'file', 'File with text to run.', false, true ); - $this->addOption( 'dump', 'XML dump to execute all revisions.', false, true ); - $this->addOption( 'from', 'Article from XML dump to start from.', false, true ); $this->addOption( 'cache', 'Use and populate the preprocessor cache.', false, false ); $this->addOption( 'preprocessor', 'Preprocessor to use.', false, false ); } @@ -72,16 +64,11 @@ class PreprocessDump extends Maintenance { return false; } - public function execute() { + public function checkOptions() { global $wgParser, $wgParserConf, $wgPreprocessorCacheThreshold; - - if (! ( $this->hasOption( 'file' ) ^ $this->hasOption( 'dump' ) ) ) { - $this->error("You must provide a file or dump", true); - } if ( !$this->hasOption( 'cache' ) ) { $wgPreprocessorCacheThreshold = false; - $this->saveFailed = $this->getOption('save-failed'); } if ( $this->hasOption( 'preprocessor' ) ) { @@ -94,71 +81,22 @@ class PreprocessDump extends Maintenance { $wgParser->firstCallInit(); $this->mPreprocessor = new $name( $this ); - - if ( $this->hasOption( 'file' ) ) { - $revision = new WikiRevision; - - $revision->setText( file_get_contents( $this->getOption( 'file' ) ) ); - $revision->setTitle( Title::newFromText( rawurldecode( basename( $this->getOption('file'), '.txt' ) ) ) ); - $this->handleRevision( $revision ); - return; - } - - $this->startTime = wfTime(); - - if ( $this->getOption('dump') == '-' ) { - $source = new ImportStreamSource( $this->getStdin() ); - } else { - $this->error("Sorry, I don't support dump filenames yet. Use - and provide it on stdin on the meantime.", true); - } - $importer = new WikiImporter( $source ); - - $importer->setRevisionCallback( - array( &$this, 'handleRevision' ) ); - - $this->from = $this->getOption( 'from', null ); - $this->count = 0; - $importer->doImport(); - - $delta = wfTime() - $this->startTime; - $this->error( "{$this->count} revisions preprocessed in " . round($delta, 2) . " seconds " ); - if ($delta > 0) - $this->error( round($this->count / $delta, 2) . " pages/sec" ); - - # Perform the memory_get_peak_usage() when all the other data has been output so there's no damage if it dies. - # It is only available since 5.2.0 (since 5.2.1 if you haven't compiled with --enable-memory-limit) - $this->error( "Memory peak usage of " . memory_get_peak_usage() . " bytes\n" ); } /** * Callback function for each revision, preprocessToObj() * @param $rev Revision */ - public function handleRevision( $rev ) { - $title = $rev->getTitle(); - if ( !$title ) { - $this->error( "Got bogus revision with null title!" ); - return; - } - - $this->count++; - if ( isset( $this->from ) ) { - if ( $this->from != $title ) - return; - $this->output( "Skipped " . ($this->count - 1) . " pages\n" ); - - $this->count = 1; - $this->from = null; - } + public function processRevision( $rev ) { try { $this->mPreprocessor->preprocessToObj( $rev->getText(), 0 ); } catch(Exception $e) { - $this->error("Caught exception " . $e->getMessage() . " in " . $title-> getPrefixedText() ); + $this->error("Caught exception " . $e->getMessage() . " in " . $rev->getTitle()->getPrefixedText() ); } } } $maintClass = "PreprocessDump"; -require_once( RUN_MAINTENANCE_IF_MAIN ); +require( RUN_MAINTENANCE_IF_MAIN ); -- 2.20.1