From 4c574daa454d19547c099dd0e92894c1470b35c8 Mon Sep 17 00:00:00 2001 From: Marius Hoch Date: Sun, 20 Dec 2015 22:00:20 +0100 Subject: [PATCH] Factor page name normalization out of MediaWikiSite Into a new MediaWikiPageNameNormalizer. The code has been copied over almost 1:1, I only left the phpunit test special handling in MediaWiki site. Change-Id: I008cadd29a2aa1f21098339b895c35a100959b04 --- autoload.php | 1 + includes/site/MediaWikiPageNameNormalizer.php | 196 ++++++++++++++++++ includes/site/MediaWikiSite.php | 146 +------------ .../site/MediaWikiPageNameNormalizerTest.php | 85 ++++++++ 4 files changed, 292 insertions(+), 136 deletions(-) create mode 100644 includes/site/MediaWikiPageNameNormalizer.php create mode 100644 tests/phpunit/includes/site/MediaWikiPageNameNormalizerTest.php diff --git a/autoload.php b/autoload.php index 46da116d63..6083ce2c81 100644 --- a/autoload.php +++ b/autoload.php @@ -777,6 +777,7 @@ $wgAutoloadLocalClasses = array( 'MediaWiki\\Logger\\Monolog\\WikiProcessor' => __DIR__ . '/includes/debug/logger/monolog/WikiProcessor.php', 'MediaWiki\\Logger\\NullSpi' => __DIR__ . '/includes/debug/logger/NullSpi.php', 'MediaWiki\\Logger\\Spi' => __DIR__ . '/includes/debug/logger/Spi.php', + 'MediaWiki\\Site\\MediaWikiPageNameNormalizer' => __DIR__ . '/includes/site/MediaWikiPageNameNormalizer.php', 'MediaWiki\\Tidy\\Html5Depurate' => __DIR__ . '/includes/tidy/Html5Depurate.php', 'MediaWiki\\Tidy\\RaggettBase' => __DIR__ . '/includes/tidy/RaggettBase.php', 'MediaWiki\\Tidy\\RaggettExternal' => __DIR__ . 
'/includes/tidy/RaggettExternal.php', diff --git a/includes/site/MediaWikiPageNameNormalizer.php b/includes/site/MediaWikiPageNameNormalizer.php new file mode 100644 index 0000000000..f358bd4e69 --- /dev/null +++ b/includes/site/MediaWikiPageNameNormalizer.php @@ -0,0 +1,196 @@ + + * @author Daniel Kinzler + * @author Jeroen De Dauw < jeroendedauw@gmail.com > + * @author Marius Hoch + */ +class MediaWikiPageNameNormalizer { + + /** + * Returns the normalized form of the given page title, using the + * normalization rules of the given site. If the given title is a redirect, + * the redirect will be resolved and the redirect target is returned. + * + * @note This actually makes an API request to the remote site, so beware + * that this function is slow and depends on an external service. + * + * @see Site::normalizePageName + * + * @since 1.27 + * + * @param string $pageName Page title to normalize. + * @param string $apiUrl URL of the remote site's api.php; the query is appended to it. + * + * @return string|bool The title reported by the remote API, or false if the request fails or no usable title is returned. + * @throws \MWException If $pageName is not a string. + */ + public function normalizePageName( $pageName, $apiUrl ) { + + // Only $pageName is type-checked here; $apiUrl is used as given. + if ( !is_string( $pageName ) ) { + throw new \MWException( '$pageName must be a string' ); + } + + // Prepare the call to the external site + + // Make sure the string is normalized into NFC (due to T42017) + // but do nothing to the whitespaces, that should work appropriately. + // @see https://phabricator.wikimedia.org/T42017 + $pageName = Validator::cleanUp( $pageName ); + + // Build the args for the specific call + $args = array( + 'action' => 'query', + 'prop' => 'info', + 'redirects' => true, + 'converttitles' => true, + 'format' => 'json', + 'titles' => $pageName, + // @todo options for maxlag and maxage + // Note that maxlag will lead to a long delay before a reply is made, + // but that maxage can avoid the extreme delay. On the other hand + // maxage could be nice to use anyhow as it stops unnecessary requests. + // Also consider smaxage if maxage is used. 
+ ); + + $url = wfAppendQuery( $apiUrl, $args ); + + // Go on and call the external site + // @todo we need a good way to specify a timeout here. + $ret = Http::get( $url, array(), __METHOD__ ); + + if ( $ret === false ) { + wfDebugLog( "MediaWikiSite", "call to external site failed: $url" ); + return false; + } + + $data = FormatJson::decode( $ret, true ); + + if ( !is_array( $data ) ) { + wfDebugLog( "MediaWikiSite", "call to <$url> returned bad json: " . $ret ); + return false; + } + + $page = static::extractPageRecord( $data, $pageName ); + + if ( isset( $page['missing'] ) ) { + wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for a missing page title! " + . $ret ); + return false; + } + + if ( isset( $page['invalid'] ) ) { + wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for an invalid page title! " + . $ret ); + return false; + } + + if ( !isset( $page['title'] ) ) { + wfDebugLog( "MediaWikiSite", "call to <$url> did not return a page title! " . $ret ); + return false; + } + + return $page['title']; + } + + /** + * Get normalization record for a given page title from an API response. + * + * @param array $externalData A reply from the API on an external server. + * @param string $pageTitle Identifies the page at the external site, needing normalization. + * + * @return array|bool A 'page' structure representing the page identified by $pageTitle, or false if none could be determined. + */ + private static function extractPageRecord( $externalData, $pageTitle ) { + // Special case: if only one page was returned + // we can cheat, and only return + // the single page in the "pages" substructure. + if ( isset( $externalData['query']['pages'] ) ) { + $pages = array_values( $externalData['query']['pages'] ); + if ( count( $pages ) === 1 ) { + return $pages[0]; + } + } + // This is only used during internal testing, as it is assumed + // a more optimal (and lossless) storage. + // Make initial checks and return if prerequisites are not met. 
+ if ( !is_array( $externalData ) || !isset( $externalData['query'] ) ) { + return false; + } + // Loop over the differently named structures, which are otherwise similar + $structs = array( + 'normalized' => 'from', + 'converted' => 'from', + 'redirects' => 'from', + 'pages' => 'title' + ); + foreach ( $structs as $listId => $fieldId ) { + // Check if the substructure exists at all. + if ( !isset( $externalData['query'][$listId] ) ) { + continue; + } + // Keep only the entries matching $pageTitle. NOTE(review): array_filter() preserves keys, so the [0] lookup below only finds a single hit when it is the first entry - confirm. + $collectedHits = array_filter( + array_values( $externalData['query'][$listId] ), + function ( $a ) use ( $fieldId, $pageTitle ) { + return $a[$fieldId] === $pageTitle; + } + ); + // If still looping over normalization, conversion or redirects, + // then we need to keep the new page title for later rounds. + if ( $fieldId === 'from' && is_array( $collectedHits ) ) { + switch ( count( $collectedHits ) ) { + case 0: + break; + case 1: + $pageTitle = $collectedHits[0]['to']; + break; + default: + return false; + } + } elseif ( $fieldId === 'title' && is_array( $collectedHits ) ) { + // If on the pages structure we should prepare for returning. + + switch ( count( $collectedHits ) ) { + case 0: + return false; + case 1: + return array_shift( $collectedHits ); + default: + return false; + } + } + } + // Only reached when the reply has no "pages" substructure + return false; + } + +} diff --git a/includes/site/MediaWikiSite.php b/includes/site/MediaWikiSite.php index 9fec1f4fe1..0f7e5d7e7c 100644 --- a/includes/site/MediaWikiSite.php +++ b/includes/site/MediaWikiSite.php @@ -1,4 +1,7 @@ getPrefixedText(); } else { + static $mediaWikiPageNameNormalizer = null; - // Make sure the string is normalized into NFC (due to T42017) - // but do nothing to the whitespaces, that should work appropriately. 
- // @see https://phabricator.wikimedia.org/T42017 - $pageName = UtfNormal\Validator::cleanUp( $pageName ); - - // Build the args for the specific call - $args = array( - 'action' => 'query', - 'prop' => 'info', - 'redirects' => true, - 'converttitles' => true, - 'format' => 'json', - 'titles' => $pageName, - // @todo options for maxlag and maxage - // Note that maxlag will lead to a long delay before a reply is made, - // but that maxage can avoid the extreme delay. On the other hand - // maxage could be nice to use anyhow as it stops unnecessary requests. - // Also consider smaxage if maxage is used. - ); - - $url = wfAppendQuery( $this->getFileUrl( 'api.php' ), $args ); - - // Go on call the external site - // @todo we need a good way to specify a timeout here. - $ret = Http::get( $url, array(), __METHOD__ ); - } - - if ( $ret === false ) { - wfDebugLog( "MediaWikiSite", "call to external site failed: $url" ); - return false; - } - - $data = FormatJson::decode( $ret, true ); - - if ( !is_array( $data ) ) { - wfDebugLog( "MediaWikiSite", "call to <$url> returned bad json: " . $ret ); - return false; - } - - $page = static::extractPageRecord( $data, $pageName ); - - if ( isset( $page['missing'] ) ) { - wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for a missing page title! " - . $ret ); - return false; - } - - if ( isset( $page['invalid'] ) ) { - wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for an invalid page title! " - . $ret ); - return false; - } - - if ( !isset( $page['title'] ) ) { - wfDebugLog( "MediaWikiSite", "call to <$url> did not return a page title! " . $ret ); - return false; - } - - return $page['title']; - } - - /** - * Get normalization record for a given page title from an API response. - * - * @since 1.21 - * - * @param array $externalData A reply from the API on a external server. - * @param string $pageTitle Identifies the page at the external site, needing normalization. 
- * - * @return array|bool A 'page' structure representing the page identified by $pageTitle. - */ - private static function extractPageRecord( $externalData, $pageTitle ) { - // If there is a special case with only one returned page - // we can cheat, and only return - // the single page in the "pages" substructure. - if ( isset( $externalData['query']['pages'] ) ) { - $pages = array_values( $externalData['query']['pages'] ); - if ( count( $pages ) === 1 ) { - return $pages[0]; - } - } - // This is only used during internal testing, as it is assumed - // a more optimal (and lossfree) storage. - // Make initial checks and return if prerequisites are not meet. - if ( !is_array( $externalData ) || !isset( $externalData['query'] ) ) { - return false; - } - // Loop over the tree different named structures, that otherwise are similar - $structs = array( - 'normalized' => 'from', - 'converted' => 'from', - 'redirects' => 'from', - 'pages' => 'title' - ); - foreach ( $structs as $listId => $fieldId ) { - // Check if the substructure exist at all. - if ( !isset( $externalData['query'][$listId] ) ) { - continue; + if ( $mediaWikiPageNameNormalizer === null ) { + $mediaWikiPageNameNormalizer = new MediaWikiPageNameNormalizer(); } - // Filter the substructure down to what we actually are using. - $collectedHits = array_filter( - array_values( $externalData['query'][$listId] ), - function ( $a ) use ( $fieldId, $pageTitle ) { - return $a[$fieldId] === $pageTitle; - } + + return $mediaWikiPageNameNormalizer->normalizePageName( + $pageName, + $this->getFileUrl( 'api.php' ) ); - // If still looping over normalization, conversion or redirects, - // then we need to keep the new page title for later rounds. 
- if ( $fieldId === 'from' && is_array( $collectedHits ) ) { - switch ( count( $collectedHits ) ) { - case 0: - break; - case 1: - $pageTitle = $collectedHits[0]['to']; - break; - default: - return false; - } - } elseif ( $fieldId === 'title' && is_array( $collectedHits ) ) { - // If on the pages structure we should prepare for returning. - switch ( count( $collectedHits ) ) { - case 0: - return false; - case 1: - return array_shift( $collectedHits ); - default: - return false; - } - } } - // should never be here - return false; } /** diff --git a/tests/phpunit/includes/site/MediaWikiPageNameNormalizerTest.php b/tests/phpunit/includes/site/MediaWikiPageNameNormalizerTest.php new file mode 100644 index 0000000000..163c52d016 --- /dev/null +++ b/tests/phpunit/includes/site/MediaWikiPageNameNormalizerTest.php @@ -0,0 +1,85 @@ + 3 ), + __METHOD__ + ); + + if ( $res === false || strpos( $res, '"sitename":"Wikidata"' ) === false ) { + $connectivity = false; + } else { + $connectivity = true; + } + } + + if ( !$connectivity ) { + $this->markTestSkipped( 'MediaWikiPageNameNormalizerTest needs internet connectivity.' ); + } + } + + /** + * @dataProvider normalizePageTitleProvider + */ + public function testNormalizePageTitle( $expected, $pageName ) { + $normalizer = new MediaWikiPageNameNormalizer(); + + $this->assertSame( + $expected, + $normalizer->normalizePageName( $pageName, 'https://www.wikidata.org/w/api.php' ) + ); + } + + public function normalizePageTitleProvider() { + // Note: This makes (very conservative) assumptions about pages on Wikidata + // existing or not. + return array( + 'universe (Q1)' => array( + 'Q1', 'Q1' + ), + 'Q404 redirects to Q395' => array( + 'Q395', 'Q404' + ), + 'there is no Q0' => array( + false, 'Q0' + ) + ); + } + +} -- 2.20.1