From bc9e1a086e3933902633dc54f7aecd7d47fddcb9 Mon Sep 17 00:00:00 2001 From: Stanislav Malyshev Date: Mon, 21 Aug 2017 17:05:53 -0700 Subject: [PATCH] Create update SPARQL for category changes This script creates SPARQL UPDATE statements for changes in a given time period. These statements can be applied to an existing database to update it. See tests for examples of what the statements look like. Bug: T173774 Change-Id: I9867ad566c0619b55a48a011bd3c55321b1bfcff --- autoload.php | 1 + maintenance/categoryChangesAsRdf.php | 542 ++++++++++++++++++ .../phpunit/data/categoriesrdf/change.sparql | 16 + .../phpunit/data/categoriesrdf/delete.sparql | 10 + tests/phpunit/data/categoriesrdf/move.sparql | 24 + tests/phpunit/data/categoriesrdf/new.sparql | 19 + .../phpunit/data/categoriesrdf/restore.sparql | 10 + tests/phpunit/data/categoriesrdf/updatets.txt | 9 + .../maintenance/categoryChangesRdfTest.php | 263 +++++++++ 9 files changed, 894 insertions(+) create mode 100644 maintenance/categoryChangesAsRdf.php create mode 100644 tests/phpunit/data/categoriesrdf/change.sparql create mode 100644 tests/phpunit/data/categoriesrdf/delete.sparql create mode 100644 tests/phpunit/data/categoriesrdf/move.sparql create mode 100644 tests/phpunit/data/categoriesrdf/new.sparql create mode 100644 tests/phpunit/data/categoriesrdf/restore.sparql create mode 100644 tests/phpunit/data/categoriesrdf/updatets.txt create mode 100644 tests/phpunit/maintenance/categoryChangesRdfTest.php diff --git a/autoload.php b/autoload.php index bc0e69e718..f93d72331c 100644 --- a/autoload.php +++ b/autoload.php @@ -225,6 +225,7 @@ $wgAutoloadLocalClasses = [ 'CapsCleanup' => __DIR__ . '/maintenance/cleanupCaps.php', 'CategoriesRdf' => __DIR__ . '/includes/CategoriesRdf.php', 'Category' => __DIR__ . '/includes/Category.php', + 'CategoryChangesAsRdf' => __DIR__ . '/maintenance/categoryChangesAsRdf.php', 'CategoryFinder' => __DIR__ . '/includes/CategoryFinder.php', 'CategoryMembershipChange' => __DIR__ . 
'/includes/changes/CategoryMembershipChange.php', 'CategoryMembershipChangeJob' => __DIR__ . '/includes/jobqueue/jobs/CategoryMembershipChangeJob.php', diff --git a/maintenance/categoryChangesAsRdf.php b/maintenance/categoryChangesAsRdf.php new file mode 100644 index 0000000000..a12cda78b4 --- /dev/null +++ b/maintenance/categoryChangesAsRdf.php @@ -0,0 +1,542 @@ +addDescription( "Generate RDF dump of category changes in a wiki." ); + + $this->setBatchSize( 200 ); + $this->addOption( 'output', "Output file (default is stdout). Will be overwritten.", false, + true, 'o' ); + $this->addOption( 'start', 'Starting timestamp (inclusive), in ISO or Mediawiki format.', + true, true, 's' ); + $this->addOption( 'end', 'Ending timestamp (exclusive), in ISO or Mediawiki format.', true, + true, 'e' ); + } + + /** + * Initialize external service classes. + */ + public function initialize() { + // SPARQL Update syntax is close to Turtle format, so we can use Turtle writer. + $this->rdfWriter = new TurtleRdfWriter(); + $this->categoriesRdf = new CategoriesRdf( $this->rdfWriter ); + } + + public function execute() { + global $wgRCMaxAge; + + $this->initialize(); + + $startTS = new MWTimestamp( $this->getOption( "start" ) ); + $endTS = new MWTimestamp( $this->getOption( "end" ) ); + $now = new MWTimestamp(); + + if ( $now->getTimestamp() - $startTS->getTimestamp() > $wgRCMaxAge ) { + $this->error( "Start timestamp too old, maximum RC age is $wgRCMaxAge!" ); + } + if ( $now->getTimestamp() - $endTS->getTimestamp() > $wgRCMaxAge ) { + $this->error( "End timestamp too old, maximum RC age is $wgRCMaxAge!" 
); + } + + $this->startTS = $startTS->getTimestamp(); + $this->endTS = $endTS->getTimestamp(); + + $outFile = $this->getOption( 'output', 'php://stdout' ); + if ( $outFile === '-' ) { + $outFile = 'php://stdout'; + } + + $output = fopen( $outFile, 'wb' ); + + $this->categoriesRdf->setupPrefixes(); + $this->rdfWriter->start(); + + $prefixes = $this->getRdf(); + // We have to strip @ from prefix, since SPARQL UPDATE doesn't use them + // Also strip dot at the end. + $prefixes = preg_replace( [ '/^@/m', '/\s*[.]$/m' ], '', $prefixes ); + fwrite( $output, $prefixes ); + + $dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] ); + + // Deletes go first because if the page was deleted, other changes + // do not matter. This only gets true deletes, i.e. not pages that were restored. + $this->handleDeletes( $dbr, $output ); + // Moves go before additions because if category is moved, we should not process creation + // as it would produce wrong data - because create row has old title + $this->handleMoves( $dbr, $output ); + // We need to handle restores too since delete may have happened in previous update. + $this->handleRestores( $dbr, $output ); + $this->handleAdds( $dbr, $output ); + $this->handleChanges( $dbr, $output ); + + // Update timestamp + fwrite( $output, $this->updateTS( $this->endTS ) ); + } + + /** + * Get SPARQL for updating set of categories + * @param IDatabase $dbr + * @param string[] $deleteUrls List of URIs to be deleted, with <> + * @param string[] $pages List of categories: id => title + * @param string $mark Marks which operation requests the query + * @return string SPARQL query + */ + private function getCategoriesUpdate( IDatabase $dbr, $deleteUrls, $pages, $mark ) { + if ( empty( $deleteUrls ) ) { + return ""; + } + + if ( !empty( $pages ) ) { + $this->writeParentCategories( $dbr, $pages ); + } + + return "# $mark\n" . 
sprintf( self::SPARQL_DELETE_INSERT, + $this->getRdf(), + implode( ' ', $deleteUrls ) ); + } + + /** + * Write data for a set of categories + * @param IDatabase $dbr + * @param string[] $pages List of categories: id => title + */ + private function writeParentCategories( IDatabase $dbr, $pages ) { + foreach ( $this->getCategoryLinksIterator( $dbr, array_keys( $pages ) ) as $row ) { + $this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->cl_to ); + } + } + + /** + * Generate SPARQL Update code for updating dump timestamp + * @param string|int $timestamp Timestamp for last change + * @return string SPARQL Update query for timestamp. + */ + public function updateTS( $timestamp ) { + $dumpUrl = '<' . $this->categoriesRdf->getDumpURI() . '>'; + $ts = wfTimestamp( TS_ISO_8601, $timestamp ); + $tsQuery = <<mBatchSize + ); + $this->addTimestampConditions( $it, $dbr ); + $it->addJoinConditions( + [ + 'page_props' => [ + 'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = rc_cur_id' ] + ], + 'category' => [ + 'LEFT JOIN', [ 'cat_title = rc_title' ] + ] + ] + ); + $it->setFetchColumns( array_merge( $columns, [ + 'rc_title', + 'rc_cur_id', + 'pp_propname', + 'cat_pages', + 'cat_subcats', + 'cat_files' + ] ) ); + return $it; + } + + /** + * Fetch newly created categories + * @param IDatabase $dbr + * @return BatchRowIterator + */ + protected function getNewCatsIterator( IDatabase $dbr ) { + $it = $this->setupChangesIterator( $dbr ); + $it->addConditions( [ + 'rc_namespace' => NS_CATEGORY, + 'rc_new' => 1, + ] ); + return $it; + } + + /** + * Fetch moved categories + * @param IDatabase $dbr + * @return BatchRowIterator + */ + protected function getMovedCatsIterator( IDatabase $dbr ) { + $it = $this->setupChangesIterator( $dbr, [ 'page_title', 'page_namespace' ], [ 'page' ] ); + $it->addConditions( [ + 'rc_namespace' => NS_CATEGORY, + 'rc_new' => 0, + 'rc_log_type' => 'move', + 'rc_type' => RC_LOG, + ] ); + $it->addJoinConditions( [ + 'page' => [ 'INNER 
JOIN', 'rc_cur_id = page_id' ], + ] ); + $this->addIndex( $it ); + return $it; + } + + /** + * Fetch deleted categories + * @param IDatabase $dbr + * @return BatchRowIterator + */ + protected function getDeletedCatsIterator( IDatabase $dbr ) { + $it = new BatchRowIterator( $dbr, + 'recentchanges', + [ 'rc_timestamp' ], + $this->mBatchSize + ); + $this->addTimestampConditions( $it, $dbr ); + $it->addConditions( [ + 'rc_namespace' => NS_CATEGORY, + 'rc_new' => 0, + 'rc_log_type' => 'delete', + 'rc_log_action' => 'delete', + 'rc_type' => RC_LOG, + // We will fetch ones that do not have page record. If they do, + // this means they were restored, thus restoring handler will pick it up. + 'NOT EXISTS (SELECT * FROM page WHERE page_id = rc_cur_id)', + ] ); + $this->addIndex( $it ); + $it->setFetchColumns( [ 'rc_cur_id', 'rc_title' ] ); + return $it; + } + + /** + * Fetch restored categories + * @param IDatabase $dbr + * @return BatchRowIterator + */ + protected function getRestoredCatsIterator( IDatabase $dbr ) { + $it = $this->setupChangesIterator( $dbr ); + $it->addConditions( [ + 'rc_namespace' => NS_CATEGORY, + 'rc_new' => 0, + 'rc_log_type' => 'delete', + 'rc_log_action' => 'restore', + 'rc_type' => RC_LOG, + // We will only fetch ones that have page record + 'EXISTS (SELECT page_id FROM page WHERE page_id = rc_cur_id)', + ] ); + $this->addIndex( $it ); + return $it; + } + + /** + * Fetch categorization changes + * @param IDatabase $dbr + * @return BatchRowIterator + */ + protected function getChangedCatsIterator( IDatabase $dbr ) { + $it = $this->setupChangesIterator( $dbr ); + $it->addConditions( [ + 'rc_namespace' => NS_CATEGORY, + 'rc_new' => 0, + 'rc_type' => [ RC_EDIT, RC_CATEGORIZE ], + ] ); + $this->addIndex( $it ); + return $it; + } + + /** + * Add timestamp limits to iterator + * @param BatchRowIterator $it Iterator + * @param IDatabase $dbr + */ + private function addTimestampConditions( BatchRowIterator $it, IDatabase $dbr ) { + $it->addConditions( [ + 
'rc_timestamp >= ' . $dbr->addQuotes( $dbr->timestamp( $this->startTS ) ), + 'rc_timestamp < ' . $dbr->addQuotes( $dbr->timestamp( $this->endTS ) ), + ] ); + } + + /** + * Need to force index, somehow on terbium the optimizer chooses wrong one + * @param BatchRowIterator $it + */ + private function addIndex( BatchRowIterator $it ) { + $it->addOptions( [ + 'USE INDEX' => [ 'recentchanges' => 'new_name_timestamp' ] + ] ); + } + + /** + * Get iterator for links for categories. + * @param IDatabase $dbr + * @param array $ids List of page IDs + * @return Traversable + */ + protected function getCategoryLinksIterator( IDatabase $dbr, array $ids ) { + $it = new BatchRowIterator( + $dbr, + 'categorylinks', + [ 'cl_from', 'cl_to' ], + $this->mBatchSize + ); + $it->addConditions( [ + 'cl_type' => 'subcat', + 'cl_from' => $ids + ] ); + $it->setFetchColumns( [ 'cl_from', 'cl_to' ] ); + return new RecursiveIteratorIterator( $it ); + } + + /** + * Get accumulated RDF. + * @return string + */ + public function getRdf() { + return $this->rdfWriter->drain(); + } + + /** + * Handle category deletes. + * @param IDatabase $dbr + * @param resource $output File to write the output + */ + public function handleDeletes( IDatabase $dbr, $output ) { + // This only does "true" deletes - i.e. those that the page stays deleted + foreach ( $this->getDeletedCatsIterator( $dbr ) as $batch ) { + $deleteUrls = []; + foreach ( $batch as $row ) { + // This can produce duplicates, we don't care + $deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>'; + $this->processed[$row->rc_cur_id] = true; + } + fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, [], "Deletes" ) ); + } + } + + /** + * Write category data to RDF. 
+ * @param stdclass $row Database row + */ + private function writeCategoryData( $row ) { + $this->categoriesRdf->writeCategoryData( + $row->rc_title, + $row->pp_propname === 'hiddencat', + (int)$row->cat_pages - (int)$row->cat_subcats - (int)$row->cat_files, + (int)$row->cat_subcats + ); + } + + /** + * @param IDatabase $dbr + * @param resource $output + */ + public function handleMoves( IDatabase $dbr, $output ) { + foreach ( $this->getMovedCatsIterator( $dbr ) as $batch ) { + $pages = []; + $deleteUrls = []; + foreach ( $batch as $row ) { + $deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>'; + + if ( isset( $this->processed[$row->rc_cur_id] ) ) { + // We already captured this one before + continue; + } + + if ( $row->page_namespace != NS_CATEGORY ) { + // If page was moved out of Category:, we'll just delete + continue; + } + $row->rc_title = $row->page_title; + $this->writeCategoryData( $row ); + $pages[$row->rc_cur_id] = $row->page_title; + $this->processed[$row->rc_cur_id] = true; + } + + fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Moves" ) ); + } + } + + /** + * @param IDatabase $dbr + * @param resource $output + */ + public function handleRestores( IDatabase $dbr, $output ) { + fwrite( $output, "# Restores\n" ); + // This will only find those restores that were not deleted later. 
+ foreach ( $this->getRestoredCatsIterator( $dbr ) as $batch ) { + $pages = []; + foreach ( $batch as $row ) { + if ( isset( $this->processed[$row->rc_cur_id] ) ) { + // We already captured this one before + continue; + } + $this->writeCategoryData( $row ); + $pages[$row->rc_cur_id] = $row->rc_title; + $this->processed[$row->rc_cur_id] = true; + } + + if ( empty( $pages ) ) { + continue; + } + + $this->writeParentCategories( $dbr, $pages ); + + fwrite( $output, sprintf( self::SPARQL_INSERT, $this->getRdf() ) ); + } + } + + /** + * @param IDatabase $dbr + * @param resource $output + */ + public function handleAdds( IDatabase $dbr, $output ) { + fwrite( $output, "# Additions\n" ); + foreach ( $this->getNewCatsIterator( $dbr ) as $batch ) { + $pages = []; + foreach ( $batch as $row ) { + if ( isset( $this->processed[$row->rc_cur_id] ) ) { + // We already captured this one before + continue; + } + $this->writeCategoryData( $row ); + $pages[$row->rc_cur_id] = $row->rc_title; + $this->processed[$row->rc_cur_id] = true; + } + + if ( empty( $pages ) ) { + continue; + } + + $this->writeParentCategories( $dbr, $pages ); + fwrite( $output, sprintf( self::SPARQL_INSERT, $this->getRdf() ) ); + } + } + + /** + * @param IDatabase $dbr + * @param resource $output + */ + public function handleChanges( IDatabase $dbr, $output ) { + foreach ( $this->getChangedCatsIterator( $dbr ) as $batch ) { + $pages = []; + $deleteUrls = []; + foreach ( $batch as $row ) { + if ( isset( $this->processed[$row->rc_cur_id] ) ) { + // We already captured this one before + continue; + } + $this->writeCategoryData( $row ); + $pages[$row->rc_cur_id] = $row->rc_title; + $this->processed[$row->rc_cur_id] = true; + $deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . 
'>'; + } + + fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Changes" ) ); + } + } +} + +$maintClass = CategoryChangesAsRdf::class; +require_once RUN_MAINTENANCE_IF_MAIN; diff --git a/tests/phpunit/data/categoriesrdf/change.sparql b/tests/phpunit/data/categoriesrdf/change.sparql new file mode 100644 index 0000000000..d7ec83a526 --- /dev/null +++ b/tests/phpunit/data/categoriesrdf/change.sparql @@ -0,0 +1,16 @@ +# Changes +DELETE { +?category ?x ?y +} INSERT { + + a mediawiki:Category ; + rdfs:label "Changed category" ; + mediawiki:pages "7"^^xsd:integer ; + mediawiki:subcategories "2"^^xsd:integer ; + mediawiki:isInCategory . + +} WHERE { + VALUES ?category { + + } +}; diff --git a/tests/phpunit/data/categoriesrdf/delete.sparql b/tests/phpunit/data/categoriesrdf/delete.sparql new file mode 100644 index 0000000000..7fb642dc33 --- /dev/null +++ b/tests/phpunit/data/categoriesrdf/delete.sparql @@ -0,0 +1,10 @@ +# Deletes +DELETE { +?category ?x ?y +} INSERT { + +} WHERE { + VALUES ?category { + + } +}; diff --git a/tests/phpunit/data/categoriesrdf/move.sparql b/tests/phpunit/data/categoriesrdf/move.sparql new file mode 100644 index 0000000000..c9f284ec11 --- /dev/null +++ b/tests/phpunit/data/categoriesrdf/move.sparql @@ -0,0 +1,24 @@ +# Moves +DELETE { +?category ?x ?y +} INSERT { + + a mediawiki:Category ; + rdfs:label "MovedTo" ; + mediawiki:pages "7"^^xsd:integer ; + mediawiki:subcategories "2"^^xsd:integer . + + a mediawiki:Category ; + rdfs:label "AlsoMoved" ; + mediawiki:pages "7"^^xsd:integer ; + mediawiki:subcategories "2"^^xsd:integer . + + mediawiki:isInCategory . + + mediawiki:isInCategory . 
+ +} WHERE { + VALUES ?category { + + } +}; diff --git a/tests/phpunit/data/categoriesrdf/new.sparql b/tests/phpunit/data/categoriesrdf/new.sparql new file mode 100644 index 0000000000..f9a742d5c1 --- /dev/null +++ b/tests/phpunit/data/categoriesrdf/new.sparql @@ -0,0 +1,19 @@ +# Additions +INSERT DATA { + + a mediawiki:Category ; + rdfs:label "New category" ; + mediawiki:pages "7"^^xsd:integer ; + mediawiki:subcategories "2"^^xsd:integer . + + a mediawiki:Category, + mediawiki:HiddenCategory ; + rdfs:label "Новая категория 😃" ; + mediawiki:pages "7"^^xsd:integer ; + mediawiki:subcategories "2"^^xsd:integer . + + mediawiki:isInCategory . + + mediawiki:isInCategory . + +}; diff --git a/tests/phpunit/data/categoriesrdf/restore.sparql b/tests/phpunit/data/categoriesrdf/restore.sparql new file mode 100644 index 0000000000..16c0561463 --- /dev/null +++ b/tests/phpunit/data/categoriesrdf/restore.sparql @@ -0,0 +1,10 @@ +# Restores +INSERT DATA { + + a mediawiki:Category ; + rdfs:label "Restored cat" ; + mediawiki:pages "7"^^xsd:integer ; + mediawiki:subcategories "2"^^xsd:integer ; + mediawiki:isInCategory . + +}; diff --git a/tests/phpunit/data/categoriesrdf/updatets.txt b/tests/phpunit/data/categoriesrdf/updatets.txt new file mode 100644 index 0000000000..426bb92fb9 --- /dev/null +++ b/tests/phpunit/data/categoriesrdf/updatets.txt @@ -0,0 +1,9 @@ +DELETE { + schema:dateModified ?o . +} +WHERE { + schema:dateModified ?o . +}; +INSERT DATA { + schema:dateModified "2017-08-25T00:29:09Z"^^xsd:dateTime . 
+} diff --git a/tests/phpunit/maintenance/categoryChangesRdfTest.php b/tests/phpunit/maintenance/categoryChangesRdfTest.php new file mode 100644 index 0000000000..30a56f49d4 --- /dev/null +++ b/tests/phpunit/maintenance/categoryChangesRdfTest.php @@ -0,0 +1,263 @@ +setMwGlobals( [ + 'wgServer' => 'http://acme.test', + 'wgCanonicalServer' => 'http://acme.test', + 'wgArticlePath' => '/wiki/$1', + ] ); + } + + public function provideCategoryData() { + return [ + 'delete category' => [ + __DIR__ . "/../data/categoriesrdf/delete.sparql", + 'getDeletedCatsIterator', + 'handleDeletes', + [ + (object)[ 'rc_title' => 'Test', 'rc_cur_id' => 1, '_processed' => 1 ], + (object)[ 'rc_title' => 'Test 2', 'rc_cur_id' => 2, '_processed' => 2 ], + ], + ], + 'move category' => [ + __DIR__ . "/../data/categoriesrdf/move.sparql", + 'getMovedCatsIterator', + 'handleMoves', + [ + (object)[ + 'rc_title' => 'Test', + 'rc_cur_id' => 4, + 'page_title' => 'MovedTo', + 'page_namespace' => NS_CATEGORY, + '_processed' => 4, + 'pp_propname' => null, + 'cat_pages' => 10, + 'cat_subcats' => 2, + 'cat_files' => 1, + ], + (object)[ + 'rc_title' => 'MovedTo', + 'rc_cur_id' => 4, + 'page_title' => 'MovedAgain', + 'page_namespace' => NS_CATEGORY, + 'pp_propname' => 'hiddencat', + 'cat_pages' => 10, + 'cat_subcats' => 2, + 'cat_files' => 1, + ], + (object)[ + 'rc_title' => 'Test 2', + 'rc_cur_id' => 5, + 'page_title' => 'AlsoMoved', + 'page_namespace' => NS_CATEGORY, + '_processed' => 5, + 'pp_propname' => null, + 'cat_pages' => 10, + 'cat_subcats' => 2, + 'cat_files' => 1, + ], + (object)[ + 'rc_title' => 'Test 3', + 'rc_cur_id' => 6, + 'page_title' => 'MovedOut', + 'page_namespace' => NS_MAIN, + 'pp_propname' => null, + 'cat_pages' => 10, + 'cat_subcats' => 2, + 'cat_files' => 1, + ], + (object)[ + 'rc_title' => 'Test 4', + 'rc_cur_id' => 7, + 'page_title' => 'Already Done', + 'page_namespace' => NS_CATEGORY, + 'pp_propname' => null, + 'cat_pages' => 10, + 'cat_subcats' => 2, + 'cat_files' => 1, + ], + 
], + [ 7 => true ], + ], + 'restore deleted category' => [ + __DIR__ . "/../data/categoriesrdf/restore.sparql", + 'getRestoredCatsIterator', + 'handleRestores', + [ + (object)[ + 'rc_title' => 'Restored cat', + 'rc_cur_id' => 10, + '_processed' => 10, + 'pp_propname' => null, + 'cat_pages' => 10, + 'cat_subcats' => 2, + 'cat_files' => 1, + ], + (object)[ + 'rc_title' => 'Restored again', + 'rc_cur_id' => 10, + 'pp_propname' => null, + 'cat_pages' => 10, + 'cat_subcats' => 2, + 'cat_files' => 1, + ], + (object)[ + 'rc_title' => 'Already seen', + 'rc_cur_id' => 11, + 'pp_propname' => null, + 'cat_pages' => 10, + 'cat_subcats' => 2, + 'cat_files' => 1, + ], + ], + [ 11 => true ], + ], + 'new page' => [ + __DIR__ . "/../data/categoriesrdf/new.sparql", + 'getNewCatsIterator', + 'handleAdds', + [ + (object)[ + 'rc_title' => 'New category', + 'rc_cur_id' => 20, + '_processed' => 20, + 'pp_propname' => null, + 'cat_pages' => 10, + 'cat_subcats' => 2, + 'cat_files' => 1, + ], + (object)[ + 'rc_title' => 'Новая категория 😃', + 'rc_cur_id' => 21, + '_processed' => 21, + 'pp_propname' => 'hiddencat', + 'cat_pages' => 10, + 'cat_subcats' => 2, + 'cat_files' => 1, + ], + (object)[ + 'rc_title' => 'Processed already', + 'rc_cur_id' => 22, + ], + ], + [ 22 => true ], + ], + 'change in categories' => [ + __DIR__ . "/../data/categoriesrdf/change.sparql", + 'getChangedCatsIterator', + 'handleChanges', + [ + (object)[ + 'rc_title' => 'Changed category', + 'rc_cur_id' => 30, + '_processed' => 30, + 'pp_propname' => null, + 'cat_pages' => 10, + 'cat_subcats' => 2, + 'cat_files' => 1, + ], + (object)[ + 'rc_title' => 'Changed again', + 'rc_cur_id' => 30, + 'pp_propname' => null, + 'cat_pages' => 10, + 'cat_subcats' => 2, + 'cat_files' => 1, + ], + (object)[ + 'rc_title' => 'Processed already', + 'rc_cur_id' => 31, + 'pp_propname' => null, + 'cat_pages' => 10, + 'cat_subcats' => 2, + 'cat_files' => 1, + ], + ], + [ 31 => true ], + ], + + ]; + } + + /** + * Mock category links iterator. 
+ * @param $dbr + * @param array $ids + * @return array + */ + public function getCategoryLinksIterator( $dbr, array $ids ) { + $res = []; + foreach ( $ids as $pageid ) { + $res[] = (object)[ 'cl_from' => $pageid, 'cl_to' => "Parent of $pageid" ]; + } + return $res; + } + + /** + * @dataProvider provideCategoryData + * @param string $testFileName Name of the test, defines filename with expected results. + * @param string $iterator Iterator method name to mock + * @param string $handler Handler method to call + * @param array $result Result to be returned from mock iterator + * @param array $preProcessed List of pre-processed items + */ + public function testSparqlUpdate( $testFileName, $iterator, $handler, $result, + array $preProcessed = [] ) { + $dumpScript = + $this->getMockBuilder( CategoryChangesAsRdf::class ) + ->setMethods( [ $iterator, 'getCategoryLinksIterator' ] ) + ->getMock(); + + $dumpScript->expects( $this->any() ) + ->method( 'getCategoryLinksIterator' ) + ->willReturnCallback( [ $this, 'getCategoryLinksIterator' ] ); + + $dumpScript->expects( $this->once() ) + ->method( $iterator ) + ->willReturn( [ $result ] ); + + $ref = new ReflectionObject( $dumpScript ); + $processedProperty = $ref->getProperty( 'processed' ); + $processedProperty->setAccessible( true ); + $processedProperty->setValue( $dumpScript, $preProcessed ); + + $output = fopen( "php://memory", "w+b" ); + $dbr = wfGetDB( DB_REPLICA ); + /** @var CategoryChangesAsRdf $dumpScript */ + $dumpScript->initialize(); + $dumpScript->getRdf(); + $dumpScript->$handler( $dbr, $output ); + + rewind( $output ); + $sparql = stream_get_contents( $output ); + $this->assertFileContains( $testFileName, $sparql ); + + $processed = $processedProperty->getValue( $dumpScript ); + $expectedProcessed = $preProcessed; + foreach ( $result as $row ) { + if ( isset( $row->_processed ) ) { + $this->assertArrayHasKey( $row->_processed, $processed, + "ID {$row->_processed} was not processed!" 
); + $expectedProcessed[] = $row->_processed; + } + } + $this->assertArrayEquals( $expectedProcessed, array_keys( $processed ), + 'Processed array has wrong items' ); + } + + public function testUpdateTs() { + $dumpScript = new CategoryChangesAsRdf(); + $dumpScript->initialize(); + $update = $dumpScript->updateTS( 1503620949 ); + $outFile = __DIR__ . '/../data/categoriesrdf/updatets.txt'; + $this->assertFileContains( $outFile, $update ); + } + +} -- 2.20.1