From 6ef0901a4e75fb1118d6abe11fbed2374ffd29f7 Mon Sep 17 00:00:00 2001 From: umherirrender Date: Mon, 23 Jul 2012 18:45:38 +0200 Subject: [PATCH] (bug 27567) Add file repo support to prop=duplicatefiles This adds a new method findBySha1s to the FileRepo classes to support the multiple hash search against all repos (with only one query for each db repo). Change-Id: I745cae7a1db3a32c20aa0067b744402fcf1a3122 --- RELEASE-NOTES-1.20 | 1 + includes/api/ApiQueryDuplicateFiles.php | 121 +++++++++++++----------- includes/filerepo/FileRepo.php | 18 ++++ includes/filerepo/LocalRepo.php | 33 +++++++ includes/filerepo/RepoGroup.php | 22 +++++ 5 files changed, 141 insertions(+), 54 deletions(-) diff --git a/RELEASE-NOTES-1.20 b/RELEASE-NOTES-1.20 index 2f463a763a..24d6b19d2c 100644 --- a/RELEASE-NOTES-1.20 +++ b/RELEASE-NOTES-1.20 @@ -198,6 +198,7 @@ upgrade PHP if you have not done so prior to upgrading MediaWiki. * (bug 36987) API avoids mangling fields in continuation parameters * (bug 30836) siteinfo prop=specialpagealiases will no longer return nonexistent special pages * (bug 38190) Add "required" flag to some token params for hint in api docs. +* (bug 27567) Add file repo support to prop=duplicatefiles. === Languages updated in 1.20 === diff --git a/includes/api/ApiQueryDuplicateFiles.php b/includes/api/ApiQueryDuplicateFiles.php index e05ffb6fae..719c84a7b4 100644 --- a/includes/api/ApiQueryDuplicateFiles.php +++ b/includes/api/ApiQueryDuplicateFiles.php @@ -59,73 +59,86 @@ class ApiQueryDuplicateFiles extends ApiQueryGeneratorBase { } $images = $namespaces[NS_FILE]; - $this->addTables( 'image', 'i1' ); - $this->addTables( 'image', 'i2' ); - $this->addFields( array( - 'i1.img_name AS orig_name', - 'i2.img_name AS dup_name', - 'i2.img_user_text AS dup_user_text', - 'i2.img_timestamp AS dup_timestamp' - ) ); - - $this->addWhere( array( - 'i1.img_name' => array_keys( $images ), - 'i1.img_sha1 = i2.img_sha1', - 'i1.img_name != i2.img_name', - ) ); + if( $params['dir'] == 'descending' ) { + $images = array_reverse( $images ); + } + $skipUntilThisDup = false; if ( isset( $params['continue'] ) ) { $cont = explode( '|', $params['continue'] ); if ( count( $cont ) != 2 ) { $this->dieUsage( 'Invalid continue param. You should pass the ' . 'original value returned by the previous query', '_badcontinue' ); } - $op = $params['dir'] == 'descending' ? '<' : '>'; - $db = $this->getDB(); - $orig = $db->addQuotes( $cont[0] ); - $dup = $db->addQuotes( $cont[1] ); - $this->addWhere( - "i1.img_name $op $orig OR " . - "(i1.img_name = $orig AND " . - "i2.img_name $op= $dup)" - ); + $fromImage = $cont[0]; + $skipUntilThisDup = $cont[1]; + // Filter out any images before $fromImage + foreach ( $images as $image => $pageId ) { + if ( $image < $fromImage ) { + unset( $images[$image] ); + } else { + break; + } + } } - $sort = ( $params['dir'] == 'descending' ? ' DESC' : '' ); - // Don't order by i1.img_name if it's constant in the WHERE clause - if ( count( $this->getPageSet()->getGoodTitles() ) == 1 ) { - $this->addOption( 'ORDER BY', 'i2.img_name' . $sort ); - } else { - $this->addOption( 'ORDER BY', array( - 'i1.img_name' . $sort, - 'i2.img_name' . $sort - )); - } - $this->addOption( 'LIMIT', $params['limit'] + 1 ); + $files = RepoGroup::singleton()->findFiles( array_keys( $images ) ); - $res = $this->select( __METHOD__ ); + $fit = true; $count = 0; $titles = array(); - foreach ( $res as $row ) { - if ( ++$count > $params['limit'] ) { - // We've reached the one extra which shows that - // there are additional pages to be had. Stop here... - $this->setContinueEnumParameter( 'continue', $row->orig_name . '|' . $row->dup_name ); - break; + + $sha1s = array(); + foreach ( $files as $file ) { + $sha1s[$file->getName()] = $file->getSha1(); + } + + // find all files with the hashes, result format is: array( hash => array( dup1, dup2 ), hash1 => ... ) + $filesBySha1s = RepoGroup::singleton()->findBySha1s( array_unique( array_values( $sha1s ) ) ); + + // iterate over $images to handle continue param correct + foreach( $images as $image => $pageId ) { + if( !isset( $sha1s[$image] ) ) { + continue; //file does not exist } - if ( !is_null( $resultPageSet ) ) { - $titles[] = Title::makeTitle( NS_FILE, $row->dup_name ); - } else { - $r = array( - 'name' => $row->dup_name, - 'user' => $row->dup_user_text, - 'timestamp' => wfTimestamp( TS_ISO_8601, $row->dup_timestamp ) - ); - $fit = $this->addPageSubItem( $images[$row->orig_name], $r ); - if ( !$fit ) { - $this->setContinueEnumParameter( 'continue', $row->orig_name . '|' . $row->dup_name ); + $sha1 = $sha1s[$image]; + $dupFiles = $filesBySha1s[$sha1]; + if( $params['dir'] == 'descending' ) { + $dupFiles = array_reverse( $dupFiles ); + } + foreach ( $dupFiles as $dupFile ) { + $dupName = $dupFile->getName(); + if( $image == $dupName ) { + continue; //ignore the file itself + } + if( $skipUntilThisDup !== false && $dupName < $skipUntilThisDup ) { + continue; //skip to pos after the image from continue param + } + $skipUntilThisDup = false; + if ( ++$count > $params['limit'] ) { + $fit = false; //break outer loop + // We're one over limit which shows that + // there are additional images to be had. Stop here... + $this->setContinueEnumParameter( 'continue', $image . '|' . $dupName ); break; } + if ( !is_null( $resultPageSet ) ) { + $titles[] = $file->getTitle(); + } else { + $r = array( + 'name' => $dupName, + 'user' => $dupFile->getUser( 'text' ), + 'timestamp' => wfTimestamp( TS_ISO_8601, $dupFile->getTimestamp() ) + ); + $fit = $this->addPageSubItem( $pageId, $r ); + if ( !$fit ) { + $this->setContinueEnumParameter( 'continue', $image . '|' . $dupName ); + break; + } + } + } + if( !$fit ) { + break; } } if ( !is_null( $resultPageSet ) ) { @@ -155,7 +168,7 @@ class ApiQueryDuplicateFiles extends ApiQueryGeneratorBase { public function getParamDescription() { return array( - 'limit' => 'How many files to return', + 'limit' => 'How many duplicate files to return', 'continue' => 'When more results are available, use this to continue', 'dir' => 'The direction in which to list', ); @@ -172,7 +185,7 @@ class ApiQueryDuplicateFiles extends ApiQueryGeneratorBase { } public function getDescription() { - return 'List all files that are duplicates of the given file(s)'; + return 'List all files that are duplicates of the given file(s) based on hash values'; } public function getPossibleErrors() { diff --git a/includes/filerepo/FileRepo.php b/includes/filerepo/FileRepo.php index 3d2ccf4fd1..5a550966f3 100644 --- a/includes/filerepo/FileRepo.php +++ b/includes/filerepo/FileRepo.php @@ -439,6 +439,24 @@ class FileRepo { return array(); } + /** + * Get an array of arrays or iterators of file objects for files that + * have the given SHA-1 content hashes. + * + * @param $hashes array An array of hashes + * @return array An Array of arrays or iterators of file objects and the hash as key + */ + public function findBySha1s( array $hashes ) { + $result = array(); + foreach ( $hashes as $hash ) { + $files = $this->findBySha1( $hash ); + if ( count( $files ) ) { + $result[$hash] = $files; + } + } + return $result; + } + /** * Get the public root URL of the repository * diff --git a/includes/filerepo/LocalRepo.php b/includes/filerepo/LocalRepo.php index dd0c947554..0954422d4f 100644 --- a/includes/filerepo/LocalRepo.php +++ b/includes/filerepo/LocalRepo.php @@ -248,6 +248,39 @@ class LocalRepo extends FileRepo { return $result; } + /** + * Get an array of arrays or iterators of file objects for files that + * have the given SHA-1 content hashes. + * + * Overrides generic implementation in FileRepo for performance reason + * + * @param $hashes array An array of hashes + * @return array An Array of arrays or iterators of file objects and the hash as key + */ + function findBySha1s( array $hashes ) { + if( !count( $hashes ) ) { + return array(); //empty parameter + } + + $dbr = $this->getSlaveDB(); + $res = $dbr->select( + 'image', + LocalFile::selectFields(), + array( 'img_sha1' => $hashes ), + __METHOD__, + array( 'ORDER BY' => 'img_name' ) + ); + + $result = array(); + foreach ( $res as $row ) { + $file = $this->newFileFromRow( $row ); + $result[$file->getSha1()][] = $file; + } + $res->free(); + + return $result; + } + /** * Get a connection to the slave DB * @return DatabaseBase diff --git a/includes/filerepo/RepoGroup.php b/includes/filerepo/RepoGroup.php index 6b31b7e54d..f9e5759920 100644 --- a/includes/filerepo/RepoGroup.php +++ b/includes/filerepo/RepoGroup.php @@ -263,6 +263,28 @@ class RepoGroup { return $result; } + /** + * Find all instances of files with this keys + * + * @param $hashes Array base 36 SHA-1 hashes + * @return Array of array of File objects + */ + function findBySha1s( array $hashes ) { + if ( !$this->reposInitialised ) { + $this->initialiseRepos(); + } + + $result = $this->localRepo->findBySha1s( $hashes ); + foreach ( $this->foreignRepos as $repo ) { + $result = array_merge_recursive( $result, $repo->findBySha1s( $hashes ) ); + } + //sort the merged (and presorted) sublist of each hash + foreach( $result as $hash => $files ) { + usort( $result[$hash], 'File::compare' ); + } + return $result; + } + /** * Get the repo instance with a given key. * @param $index string|int -- 2.20.1