Merge "Cache countable statistics to prevent multiple counting on import"
author Legoktm <legoktm.wikipedia@gmail.com>
Fri, 13 Feb 2015 19:54:17 +0000 (19:54 +0000)
committer Gerrit Code Review <gerrit@wikimedia.org>
Fri, 13 Feb 2015 19:54:17 +0000 (19:54 +0000)
1  2 
includes/Import.php
includes/page/WikiPage.php

diff --combined includes/Import.php
@@@ -42,13 -42,15 +42,15 @@@ class WikiImporter 
        private $config;
        /** @var ImportTitleFactory */
        private $importTitleFactory;
+       /** @var array */
+       private $countableCache = array();
  
        /**
         * Creates an ImportXMLReader drawing from the source provided
 -       * @param ImportStreamSource $source
 +       * @param ImportSource $source
         * @param Config $config
         */
 -      function __construct( ImportStreamSource $source, Config $config = null ) {
 +      function __construct( ImportSource $source, Config $config = null ) {
                $this->reader = new XMLReader();
                if ( !$config ) {
                        wfDeprecated( __METHOD__ . ' without a Config instance', '1.25' );
@@@ -67,6 -69,7 +69,7 @@@
                }
  
                // Default callbacks
+               $this->setPageCallback( array( $this, 'beforeImportPage' ) );
                $this->setRevisionCallback( array( $this, "importRevision" ) );
                $this->setUploadCallback( array( $this, 'importUpload' ) );
                $this->setLogItemCallback( array( $this, 'importLogItem' ) );
                $this->mImportUploads = $import;
        }
  
+       /**
+        * Default per-page callback. Records whether the page is countable at the
+        * time the callback runs, storing the result in $this->countableCache under
+        * the key 'title_' followed by the prefixed title text, so that
+        * finishImportPage() can compare it with the state after the import.
+        *
+        * @param array $titleAndForeignTitle Two-element array, with Title object at
+        * index 0 and ForeignTitle object at index 1
+        * @return bool Always true
+        */
+       public function beforeImportPage( $titleAndForeignTitle ) {
+               $importedTitle = $titleAndForeignTitle[0];
+               $wasCountable = WikiPage::factory( $importedTitle )->isCountable();
+               $cacheKey = 'title_' . $importedTitle->getPrefixedText();
+               $this->countableCache[$cacheKey] = $wasCountable;
+               return true;
+       }
        /**
         * Default per-revision callback, performs the import.
         * @param WikiRevision $revision
         */
        public function finishImportPage( $title, $foreignTitle, $revCount,
                        $sRevCount, $pageInfo ) {
+               // Update article count statistics (T42009)
+               // The normal counting logic in WikiPage->doEditUpdates() is designed for
+               // one-revision-at-a-time editing, not bulk imports. In this situation it
+               // suffers from issues of slave lag. We let WikiPage handle the total page
+               // and revision count, and we implement our own custom logic for the
+               // article (content page) count.
+               $page = WikiPage::factory( $title );
+               $page->loadPageData( 'fromdbmaster' );
+               $content = $page->getContent();
+               $editInfo = $page->prepareContentForEdit( $content );
+               $countable = $page->isCountable( $editInfo );
+               $oldcountable = $this->countableCache['title_' . $title->getPrefixedText()];
+               if ( isset( $oldcountable ) && $countable != $oldcountable ) {
+                       DeferredUpdates::addUpdate( SiteStatsUpdate::factory( array(
+                               'articles' => ( (int)$countable - (int)$oldcountable )
+                       ) ) );
+               }
                $args = func_get_args();
                return Hooks::run( 'AfterImportPage', $args );
        }
  
                $keepReading = $this->reader->read();
                $skip = false;
 -              while ( $keepReading ) {
 -                      $tag = $this->reader->name;
 -                      $type = $this->reader->nodeType;
 -
 -                      if ( !Hooks::run( 'ImportHandleToplevelXMLTag', array( $this ) ) ) {
 -                              // Do nothing
 -                      } elseif ( $tag == 'mediawiki' && $type == XMLReader::END_ELEMENT ) {
 -                              break;
 -                      } elseif ( $tag == 'siteinfo' ) {
 -                              $this->handleSiteInfo();
 -                      } elseif ( $tag == 'page' ) {
 -                              $this->handlePage();
 -                      } elseif ( $tag == 'logitem' ) {
 -                              $this->handleLogItem();
 -                      } elseif ( $tag != '#text' ) {
 -                              $this->warn( "Unhandled top-level XML tag $tag" );
 -
 -                              $skip = true;
 -                      }
 +              $rethrow = null;
 +              try {
 +                      while ( $keepReading ) {
 +                              $tag = $this->reader->name;
 +                              $type = $this->reader->nodeType;
 +
 +                              if ( !Hooks::run( 'ImportHandleToplevelXMLTag', array( $this ) ) ) {
 +                                      // Do nothing
 +                              } elseif ( $tag == 'mediawiki' && $type == XMLReader::END_ELEMENT ) {
 +                                      break;
 +                              } elseif ( $tag == 'siteinfo' ) {
 +                                      $this->handleSiteInfo();
 +                              } elseif ( $tag == 'page' ) {
 +                                      $this->handlePage();
 +                              } elseif ( $tag == 'logitem' ) {
 +                                      $this->handleLogItem();
 +                              } elseif ( $tag != '#text' ) {
 +                                      $this->warn( "Unhandled top-level XML tag $tag" );
 +
 +                                      $skip = true;
 +                              }
  
 -                      if ( $skip ) {
 -                              $keepReading = $this->reader->next();
 -                              $skip = false;
 -                              $this->debug( "Skip" );
 -                      } else {
 -                              $keepReading = $this->reader->read();
 +                              if ( $skip ) {
 +                                      $keepReading = $this->reader->next();
 +                                      $skip = false;
 +                                      $this->debug( "Skip" );
 +                              } else {
 +                                      $keepReading = $this->reader->read();
 +                              }
                        }
 +              } catch ( Exception $ex ) {
 +                      $rethrow = $ex;
                }
  
 +              // finally
                libxml_disable_entity_loader( $oldDisable );
 +              $this->reader->close();
 +
 +              if ( $rethrow ) {
 +                      throw $rethrow;
 +              }
 +
                return true;
        }
  
@@@ -979,10 -1003,10 +1015,10 @@@ class UploadSourceAdapter 
        private $mPosition;
  
        /**
 -       * @param ImportStreamSource $source
 +       * @param ImportSource $source
         * @return string
         */
 -      static function registerSource( ImportStreamSource $source ) {
 +      static function registerSource( ImportSource $source ) {
                $id = wfRandomString();
  
                self::$sourceRegistrations[$id] = $source;
@@@ -1544,7 -1568,6 +1580,6 @@@ class WikiRevision 
                                        $this->title->getPrefixedText() . "]], timestamp " . $this->timestamp . "\n" );
                                return false;
                        }
-                       $oldcountable = $page->isCountable();
                }
  
                # @todo FIXME: Use original rev_id optionally (better for backups)
  
                if ( $changed !== false && !$this->mNoUpdates ) {
                        wfDebug( __METHOD__ . ": running updates\n" );
+                       // countable/oldcountable stuff is handled in WikiImporter::finishImportPage
                        $page->doEditUpdates(
                                $revision,
                                $userObj,
-                               array( 'created' => $created, 'oldcountable' => $oldcountable )
+                               array( 'created' => $created, 'oldcountable' => 'no-change' )
                        );
                }
  
                        wfDebug( __METHOD__ . ": Successful\n" );
                        return true;
                } else {
 -                      wfDebug( __METHOD__ . ': failed: ' . $status->getXml() . "\n" );
 +                      wfDebug( __METHOD__ . ': failed: ' . $status->getHTML() . "\n" );
                        return false;
                }
        }
  
  }
  
 +/**
 + * Source interface for XML import.
 + */
 +interface ImportSource {
 +
 +      /**
 +       * Indicates whether the end of the input has been reached.
 +       * Will return true after a finite number of calls to readChunk.
 +       *
 +       * @return bool true if there is no more input, false otherwise.
 +       */
 +      function atEnd();
 +
 +      /**
 +       * Return a chunk of the input, as a (possibly empty) string.
 +       * When the end of input is reached, readChunk() returns false.
 +       * If atEnd() returns false, readChunk() will return a string.
 +       * If atEnd() returns true, readChunk() will return false.
 +       *
 +       * @return bool|string
 +       */
 +      function readChunk();
 +}
 +
  /**
   * Used for importing XML dumps where the content of the dump is in a string.
   * This class is inefficient, and should only be used for small dumps.
   *
   * @ingroup SpecialPage
   */
 -class ImportStringSource {
 +class ImportStringSource implements ImportSource {
        function __construct( $string ) {
                $this->mString = $string;
                $this->mRead = false;
   * Imports an XML dump from a file (either from file upload, files on disk, or HTTP)
   * @ingroup SpecialPage
   */
 -class ImportStreamSource {
 +class ImportStreamSource implements ImportSource {
        function __construct( $handle ) {
                $this->mHandle = $handle;
        }
@@@ -557,6 -557,7 +557,6 @@@ class WikiPage implements Page, IDBAcce
         * @return Revision|null
         */
        public function getOldestRevision() {
 -              wfProfileIn( __METHOD__ );
  
                // Try using the slave database first, then try the master
                $continue = 2;
                        }
                }
  
 -              wfProfileOut( __METHOD__ );
                return $row ? Revision::newFromRow( $row ) : null;
        }
  
         * @return array Array of authors, duplicates not removed
         */
        public function getLastNAuthors( $num, $revLatest = 0 ) {
 -              wfProfileIn( __METHOD__ );
                // First try the slave
                // If that doesn't have the latest revision, try the master
                $continue = 2;
                        );
  
                        if ( !$res ) {
 -                              wfProfileOut( __METHOD__ );
                                return array();
                        }
  
                        $authors[] = $row->rev_user_text;
                }
  
 -              wfProfileOut( __METHOD__ );
                return $authors;
        }
  
         * @return ParserOutput|bool ParserOutput or false if the revision was not found
         */
        public function getParserOutput( ParserOptions $parserOptions, $oldid = null ) {
 -              wfProfileIn( __METHOD__ );
  
                $useParserCache = $this->isParserCacheUsed( $parserOptions, $oldid );
                wfDebug( __METHOD__ . ': using parser cache: ' . ( $useParserCache ? 'yes' : 'no' ) . "\n" );
                if ( $useParserCache ) {
                        $parserOutput = ParserCache::singleton()->get( $this, $parserOptions );
                        if ( $parserOutput !== false ) {
 -                              wfProfileOut( __METHOD__ );
                                return $parserOutput;
                        }
                }
                $pool = new PoolWorkArticleView( $this, $parserOptions, $oldid, $useParserCache );
                $pool->execute();
  
 -              wfProfileOut( __METHOD__ );
 -
                return $pool->getParserOutput();
        }
  
         * @return int The newly created page_id key, or false if the title already existed
         */
        public function insertOn( $dbw ) {
 -              wfProfileIn( __METHOD__ );
  
                $page_id = $dbw->nextSequenceValue( 'page_page_id_seq' );
                $dbw->insert( 'page', array(
                        $this->mId = $newid;
                        $this->mTitle->resetArticleID( $newid );
                }
 -              wfProfileOut( __METHOD__ );
  
                return $affected ? $newid : false;
        }
        ) {
                global $wgContentHandlerUseDB;
  
 -              wfProfileIn( __METHOD__ );
 -
                $content = $revision->getContent();
                $len = $content ? $content->getSize() : 0;
                $rt = $content ? $content->getUltimateRedirectTarget() : null;
                                                                                                        $this->mLatest, $revision->getContentModel() );
                }
  
 -              wfProfileOut( __METHOD__ );
                return $result;
        }
  
                        return true;
                }
  
 -              wfProfileIn( __METHOD__ );
                if ( $isRedirect ) {
                        $this->insertRedirectEntry( $redirectTitle );
                } else {
                if ( $this->getTitle()->getNamespace() == NS_FILE ) {
                        RepoGroup::singleton()->getLocalRepo()->invalidateImageRedirect( $this->getTitle() );
                }
 -              wfProfileOut( __METHOD__ );
  
                return ( $dbw->affectedRows() != 0 );
        }
         * @return bool
         */
        public function updateIfNewerOn( $dbw, $revision ) {
 -              wfProfileIn( __METHOD__ );
  
                $row = $dbw->selectRow(
                        array( 'revision', 'page' ),
  
                if ( $row ) {
                        if ( wfTimestamp( TS_MW, $row->rev_timestamp ) >= $revision->getTimestamp() ) {
 -                              wfProfileOut( __METHOD__ );
                                return false;
                        }
                        $prev = $row->rev_id;
  
                $ret = $this->updateRevisionOn( $dbw, $revision, $prev, $lastRevIsRedirect );
  
 -              wfProfileOut( __METHOD__ );
                return $ret;
        }
  
         */
        public function replaceSectionContent( $sectionId, Content $sectionContent, $sectionTitle = '',
                $edittime = null ) {
 -              wfProfileIn( __METHOD__ );
  
                $baseRevId = null;
                if ( $edittime && $sectionId !== 'new' ) {
                        }
                }
  
 -              wfProfileOut( __METHOD__ );
                return $this->replaceSectionAtRev( $sectionId, $sectionContent, $sectionTitle, $baseRevId );
        }
  
        public function replaceSectionAtRev( $sectionId, Content $sectionContent,
                $sectionTitle = '', $baseRevId = null
        ) {
 -              wfProfileIn( __METHOD__ );
  
                if ( strval( $sectionId ) === '' ) {
                        // Whole-page edit; let the whole text through
                        $newContent = $sectionContent;
                } else {
                        if ( !$this->supportsSections() ) {
 -                              wfProfileOut( __METHOD__ );
                                throw new MWException( "sections not supported for content model " .
                                        $this->getContentHandler()->getModelID() );
                        }
                                if ( !$rev ) {
                                        wfDebug( __METHOD__ . " asked for bogus section (page: " .
                                                $this->getId() . "; section: $sectionId)\n" );
 -                                      wfProfileOut( __METHOD__ );
                                        return null;
                                }
  
  
                        if ( !$oldContent ) {
                                wfDebug( __METHOD__ . ": no page text\n" );
 -                              wfProfileOut( __METHOD__ );
                                return null;
                        }
  
                        $newContent = $oldContent->replaceSection( $sectionId, $sectionContent, $sectionTitle );
                }
  
 -              wfProfileOut( __METHOD__ );
                return $newContent;
        }
  
                        throw new MWException( 'Something is trying to edit an article with an empty title' );
                }
  
 -              wfProfileIn( __METHOD__ );
 -
                if ( !$content->getContentHandler()->canBeUsedOn( $this->getTitle() ) ) {
 -                      wfProfileOut( __METHOD__ );
                        return Status::newFatal( 'content-not-allowed-here',
                                ContentHandler::getLocalizedName( $content->getModel() ),
                                $this->getTitle()->getPrefixedText() );
                                $status->fatal( 'edit-hook-aborted' );
                        }
  
 -                      wfProfileOut( __METHOD__ );
                        return $status;
                }
  
                                wfDebug( __METHOD__ . ": EDIT_UPDATE specified but article doesn't exist\n" );
                                $status->fatal( 'edit-gone-missing' );
  
 -                              wfProfileOut( __METHOD__ );
                                return $status;
                        } elseif ( !$old_content ) {
                                // Sanity check for bug 37225
 -                              wfProfileOut( __METHOD__ );
                                throw new MWException( "Could not find text for current revision {$oldid}." );
                        }
  
                                        if ( !$status->isOK() ) {
                                                $dbw->rollback( __METHOD__ );
  
 -                                              wfProfileOut( __METHOD__ );
                                                return $status;
                                        }
                                        $revisionId = $revision->insertOn( $dbw );
  
                                                $dbw->rollback( __METHOD__ );
  
 -                                              wfProfileOut( __METHOD__ );
                                                return $status;
                                        }
  
                                                }
                                        }
                                        $user->incEditCount();
 -                              } catch ( MWException $e ) {
 +                              } catch ( Exception $e ) {
                                        $dbw->rollback( __METHOD__ );
                                        // Question: Would it perhaps be better if this method turned all
                                        // exceptions into $status's?
                                if ( !$status->isOK() ) {
                                        $dbw->rollback( __METHOD__ );
  
 -                                      wfProfileOut( __METHOD__ );
                                        return $status;
                                }
  
                                        $dbw->rollback( __METHOD__ );
                                        $status->fatal( 'edit-already-exists' );
  
 -                                      wfProfileOut( __METHOD__ );
                                        return $status;
                                }
  
                                }
                                $user->incEditCount();
  
 -                      } catch ( MWException $e ) {
 +                      } catch ( Exception $e ) {
                                $dbw->rollback( __METHOD__ );
                                throw $e;
                        }
                        $user->addAutopromoteOnceGroups( 'onEdit' );
                } );
  
 -              wfProfileOut( __METHOD__ );
                return $status;
        }
  
         * Returns a stdClass with source, pst and output members
         *
         * @param Content $content
 -       * @param int|null $revid
 +       * @param Revision|int|null $revision Revision object. For backwards compatibility, a
 +       *        revision ID is also accepted, but this is deprecated.
         * @param User|null $user
         * @param string|null $serialFormat
         * @param bool $useCache Check shared prepared edit cache
         * @since 1.21
         */
        public function prepareContentForEdit(
 -              Content $content, $revid = null, User $user = null, $serialFormat = null, $useCache = true
 +              Content $content, $revision = null, User $user = null, $serialFormat = null, $useCache = true
        ) {
 -              global $wgContLang, $wgUser;
 +              global $wgContLang, $wgUser, $wgAjaxEditStash;
 +
 +              if ( is_object( $revision ) ) {
 +                      $revid = $revision->getId();
 +              } else {
 +                      $revid = $revision;
 +                      // This code path is deprecated, and nothing is known to
 +                      // use it, so performance here shouldn't be a worry.
 +                      if ( $revid !== null ) {
 +                              $revision = Revision::newFromId( $revid, Revision::READ_LATEST );
 +                      } else {
 +                              $revision = null;
 +                      }
 +              }
  
                $user = is_null( $user ) ? $wgUser : $user;
                //XXX: check $user->getId() here???
                }
  
                // The edit may have already been prepared via api.php?action=stashedit
 -              $cachedEdit = $useCache
 +              $cachedEdit = $useCache && $wgAjaxEditStash
                        ? ApiStashEdit::checkCache( $this->getTitle(), $content, $user )
                        : false;
  
                if ( $cachedEdit ) {
                        $edit->output = $cachedEdit->output;
                } else {
 +                      if ( $revision ) {
 +                              // We get here if vary-revision is set. This means that this page references
 +                              // itself (such as via self-transclusion). In this case, we need to make sure
 +                              // that any such self-references refer to the newly-saved revision, and not
 +                              // to the previous one, which could otherwise happen due to slave lag.
 +                              $oldCallback = $edit->popts->setCurrentRevisionCallback(
 +                                      function ( $title, $parser = false ) use ( $revision, &$oldCallback ) {
 +                                              if ( $title->equals( $revision->getTitle() ) ) {
 +                                                      return $revision;
 +                                              } else {
 +                                                      return call_user_func(
 +                                                              $oldCallback,
 +                                                              $title,
 +                                                              $parser
 +                                                      );
 +                                              }
 +                                      }
 +                              );
 +                      }
                        $edit->output = $edit->pstContent
                                ? $edit->pstContent->getParserOutput( $this->mTitle, $revid, $edit->popts )
                                : null;
         * - changed: boolean, whether the revision changed the content (default true)
         * - created: boolean, whether the revision created the page (default false)
         * - moved: boolean, whether the page was moved (default false)
-        * - oldcountable: boolean or null (default null):
+        * - oldcountable: boolean, null, or string 'no-change' (default null):
         *   - boolean: whether the page was counted as an article before that
         *     revision, only used in changed is true and created is false
-        *   - null: don't change the article count
+        *   - null: if created is false, don't update the article count; if created
+        *     is true, do update the article count
+        *   - 'no-change': don't update the article count, ever
         */
        public function doEditUpdates( Revision $revision, User $user, array $options = array() ) {
                global $wgEnableParserCache;
  
 -              wfProfileIn( __METHOD__ );
 -
                $options += array(
                        'changed' => true,
                        'created' => false,
                // already pre-save transformed once.
                if ( !$this->mPreparedEdit || $this->mPreparedEdit->output->getFlag( 'vary-revision' ) ) {
                        wfDebug( __METHOD__ . ": No prepared edit or vary-revision is set...\n" );
 -                      $editInfo = $this->prepareContentForEdit( $content, $revision->getId(), $user );
 +                      $editInfo = $this->prepareContentForEdit( $content, $revision, $user );
                } else {
                        wfDebug( __METHOD__ . ": No vary-revision, using prepared edit...\n" );
                        $editInfo = $this->mPreparedEdit;
                Hooks::run( 'ArticleEditUpdates', array( &$this, &$editInfo, $options['changed'] ) );
  
                if ( Hooks::run( 'ArticleEditUpdatesDeleteFromRecentchanges', array( &$this ) ) ) {
 -                      if ( 0 == mt_rand( 0, 99 ) ) {
 -                              // Flush old entries from the `recentchanges` table; we do this on
 -                              // random requests so as to avoid an increase in writes for no good reason
 -                              RecentChange::purgeExpiredChanges();
 -                      }
 +                      // Flush old entries from the `recentchanges` table
 +                      JobQueueGroup::singleton()->push( RecentChangesUpdateJob::newPurgeJob() );
                }
  
                if ( !$this->exists() ) {
 -                      wfProfileOut( __METHOD__ );
                        return;
                }
  
                $title = $this->mTitle->getPrefixedDBkey();
                $shortTitle = $this->mTitle->getDBkey();
  
-               if ( !$options['changed'] && !$options['moved'] ) {
+               if ( $options['oldcountable'] === 'no-change' ||
+                       ( !$options['changed'] && !$options['moved'] )
+               ) {
                        $good = 0;
                } elseif ( $options['created'] ) {
                        $good = (int)$this->isCountable( $editInfo );
                        self::onArticleEdit( $this->mTitle );
                }
  
 -              wfProfileOut( __METHOD__ );
        }
  
        /**
        public function doQuickEditContent( Content $content, User $user, $comment = '', $minor = false,
                $serialFormat = null
        ) {
 -              wfProfileIn( __METHOD__ );
  
                $serialized = $content->serialize( $serialFormat );
  
  
                Hooks::run( 'NewRevisionFromEditComplete', array( $this, $revision, false, $user ) );
  
 -              wfProfileOut( __METHOD__ );
        }
  
        /**
  
                // Get the last edit not by this guy...
                // Note: these may not be public values
 -              $user = intval( $current->getRawUser() );
 -              $user_text = $dbw->addQuotes( $current->getRawUserText() );
 +              $user = intval( $current->getUser( Revision::RAW ) );
 +              $user_text = $dbw->addQuotes( $current->getUserText( Revision::RAW ) );
                $s = $dbw->selectRow( 'revision',
                        array( 'rev_id', 'rev_timestamp', 'rev_deleted' ),
                        array( 'rev_page' => $current->getPage(),
         *
         * @param Title $title
         */
 -      public static function onArticleCreate( $title ) {
 +      public static function onArticleCreate( Title $title ) {
                // Update existence markers on article/talk tabs...
                $other = $title->getOtherPage();
  
         *
         * @param Title $title
         */
 -      public static function onArticleDelete( $title ) {
 +      public static function onArticleDelete( Title $title ) {
                // Update existence markers on article/talk tabs...
                $other = $title->getOtherPage();
  
         * Purge caches on page update etc
         *
         * @param Title $title
 -       * @todo Verify that $title is always a Title object (and never false or
 -       *   null), add Title hint to parameter $title.
         */
 -      public static function onArticleEdit( $title ) {
 +      public static function onArticleEdit( Title $title ) {
                // Invalidate caches of articles which include this page
                DeferredUpdates::addHTMLCacheUpdate( $title, 'templatelinks' );