includes/site/MediaWikiPageNameNormalizer.php

   1 <?php
   2
   3 namespace MediaWiki\Site;
   4
   5 use FormatJson;
   6 use Http;
   7 use UtfNormal\Validator;
   8
   9 /**
  10  * Service for normalizing a page name using a MediaWiki api.
  11  *
  12  * This program is free software; you can redistribute it and/or modify
  13  * it under the terms of the GNU General Public License as published by
  14  * the Free Software Foundation; either version 2 of the License, or
  15  * (at your option) any later version.
  16  *
  17  * This program is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  20  * GNU General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU General Public License along
  23  * with this program; if not, write to the Free Software Foundation, Inc.,
  24  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  25  * http://www.gnu.org/copyleft/gpl.html
  26  *
  27  * @since 1.27
  28  *
  29  * @license GNU GPL v2+
  30  * @author John Erling Blad < jeblad@gmail.com >
  31  * @author Daniel Kinzler
  32  * @author Jeroen De Dauw < jeroendedauw@gmail.com >
  33  * @author Marius Hoch
  34  */
  35 class MediaWikiPageNameNormalizer {
  36
  37         /**
  38          * @var Http
  39          */
  40         private $http;
  41
  42         /**
  43          * @param Http|null $http
  44          */
  45         public function __construct( Http $http = null ) {
  46                 if ( !$http ) {
  47                         $http = new Http();
  48                 }
  49
  50                 $this->http = $http;
  51         }
  52
  53         /**
  54          * Returns the normalized form of the given page title, using the
  55          * normalization rules of the given site. If the given title is a redirect,
  56          * the redirect weill be resolved and the redirect target is returned.
  57          *
  58          * @note This actually makes an API request to the remote site, so beware
  59          *   that this function is slow and depends on an external service.
  60          *
  61          * @see Site::normalizePageName
  62          *
  63          * @since 1.27
  64          *
  65          * @param string $pageName
  66          * @param string $apiUrl
  67          *
  68          * @return string
  69          * @throws \MWException
  70          */
  71         public function normalizePageName( $pageName, $apiUrl ) {
  72                 // Check if we have strings as arguments.
  73                 if ( !is_string( $pageName ) ) {
  74                         throw new \MWException( '$pageName must be a string' );
  75                 }
  76
  77                 // Go on call the external site
  78
  79                 // Make sure the string is normalized into NFC (due to T42017)
  80                 // but do nothing to the whitespaces, that should work appropriately.
  81                 // @see https://phabricator.wikimedia.org/T42017
  82                 $pageName = Validator::cleanUp( $pageName );
  83
  84                 // Build the args for the specific call
  85                 $args = [
  86                         'action' => 'query',
  87                         'prop' => 'info',
  88                         'redirects' => true,
  89                         'converttitles' => true,
  90                         'format' => 'json',
  91                         'titles' => $pageName,
  92                         // @todo options for maxlag and maxage
  93                         // Note that maxlag will lead to a long delay before a reply is made,
  94                         // but that maxage can avoid the extreme delay. On the other hand
  95                         // maxage could be nice to use anyhow as it stops unnecessary requests.
  96                         // Also consider smaxage if maxage is used.
  97                 ];
  98
  99                 $url = wfAppendQuery( $apiUrl, $args );
 100
 101                 // Go on call the external site
 102                 // @todo we need a good way to specify a timeout here.
 103                 $ret = $this->http->get( $url, [], __METHOD__ );
 104
 105                 if ( $ret === false ) {
 106                         wfDebugLog( "MediaWikiSite", "call to external site failed: $url" );
 107                         return false;
 108                 }
 109
 110                 $data = FormatJson::decode( $ret, true );
 111
 112                 if ( !is_array( $data ) ) {
 113                         wfDebugLog( "MediaWikiSite", "call to <$url> returned bad json: " . $ret );
 114                         return false;
 115                 }
 116
 117                 $page = static::extractPageRecord( $data, $pageName );
 118
 119                 if ( isset( $page['missing'] ) ) {
 120                         wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for a missing page title! "
 121                                 . $ret );
 122                         return false;
 123                 }
 124
 125                 if ( isset( $page['invalid'] ) ) {
 126                         wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for an invalid page title! "
 127                                 . $ret );
 128                         return false;
 129                 }
 130
 131                 if ( !isset( $page['title'] ) ) {
 132                         wfDebugLog( "MediaWikiSite", "call to <$url> did not return a page title! " . $ret );
 133                         return false;
 134                 }
 135
 136                 return $page['title'];
 137         }
 138
 139         /**
 140          * Get normalization record for a given page title from an API response.
 141          *
 142          * @param array $externalData A reply from the API on a external server.
 143          * @param string $pageTitle Identifies the page at the external site, needing normalization.
 144          *
 145          * @return array|bool A 'page' structure representing the page identified by $pageTitle.
 146          */
 147         private static function extractPageRecord( $externalData, $pageTitle ) {
 148                 // If there is a special case with only one returned page
 149                 // we can cheat, and only return
 150                 // the single page in the "pages" substructure.
 151                 if ( isset( $externalData['query']['pages'] ) ) {
 152                         $pages = array_values( $externalData['query']['pages'] );
 153                         if ( count( $pages ) === 1 ) {
 154                                 return $pages[0];
 155                         }
 156                 }
 157                 // This is only used during internal testing, as it is assumed
 158                 // a more optimal (and lossfree) storage.
 159                 // Make initial checks and return if prerequisites are not meet.
 160                 if ( !is_array( $externalData ) || !isset( $externalData['query'] ) ) {
 161                         return false;
 162                 }
 163                 // Loop over the tree different named structures, that otherwise are similar
 164                 $structs = [
 165                         'normalized' => 'from',
 166                         'converted' => 'from',
 167                         'redirects' => 'from',
 168                         'pages' => 'title'
 169                 ];
 170                 foreach ( $structs as $listId => $fieldId ) {
 171                         // Check if the substructure exist at all.
 172                         if ( !isset( $externalData['query'][$listId] ) ) {
 173                                 continue;
 174                         }
 175                         // Filter the substructure down to what we actually are using.
 176                         $collectedHits = array_filter(
 177                                 array_values( $externalData['query'][$listId] ),
 178                                 function ( $a ) use ( $fieldId, $pageTitle ) {
 179                                         return $a[$fieldId] === $pageTitle;
 180                                 }
 181                         );
 182                         // If still looping over normalization, conversion or redirects,
 183                         // then we need to keep the new page title for later rounds.
 184                         if ( $fieldId === 'from' && is_array( $collectedHits ) ) {
 185                                 switch ( count( $collectedHits ) ) {
 186                                         case 0:
 187                                                 break;
 188                                         case 1:
 189                                                 $pageTitle = $collectedHits[0]['to'];
 190                                                 break;
 191                                         default:
 192                                                 return false;
 193                                 }
 194                         } elseif ( $fieldId === 'title' && is_array( $collectedHits ) ) {
 195                                 // If on the pages structure we should prepare for returning.
 196
 197                                 switch ( count( $collectedHits ) ) {
 198                                         case 0:
 199                                                 return false;
 200                                         case 1:
 201                                                 return array_shift( $collectedHits );
 202                                         default:
 203                                                 return false;
 204                                 }
 205                         }
 206                 }
 207                 // should never be here
 208                 return false;
 209         }
 210
 211 }