includes/content/WikiTextStructure.php

   1 <?php
   2
   3 use HtmlFormatter\HtmlFormatter;
   4 use MediaWiki\Logger\LoggerFactory;
   5
   6 /**
   7  * Class allowing to explore structure of parsed wikitext.
   8  */
   9 class WikiTextStructure {
  10         /**
  11          * @var string
  12          */
  13         private $openingText;
  14         /**
  15          * @var string
  16          */
  17         private $allText;
  18         /**
  19          * @var string[]
  20          */
  21         private $auxText = [];
  22         /**
  23          * @var ParserOutput
  24          */
  25         private $parserOutput;
  26
  27         /**
  28          * @var string[] selectors to elements that are excluded entirely from search
  29          */
  30         private $excludedElementSelectors = [
  31                 'audio', 'video',       // "it looks like you don't have javascript enabled..."
  32                                         // do not need to index
  33                 'sup.reference',        // The [1] for references
  34                 '.mw-cite-backlink',    // The ↑ next to references in the references section
  35                 'h1', 'h2', 'h3',       // Headings are already indexed in their own field.
  36                 'h5', 'h6', 'h4',
  37                 '.autocollapse',        // Collapsed fields are hidden by default so we don't want them
  38                                                                 // showing up.
  39         ];
  40
  41         /**
  42          * @var string[] selectors to elements that are considered auxiliary to article text for search
  43          */
  44         private $auxiliaryElementSelectors = [
  45                 '.thumbcaption',        // Thumbnail captions aren't really part of the text proper
  46                 'table',                // Neither are tables
  47                 '.rellink',             // Common style for "See also:".
  48                 '.dablink',             // Common style for calling out helpful links at the top
  49                                                                 // of the article.
  50                 '.searchaux',           // New class users can use to mark stuff as auxiliary to searches.
  51         ];
  52
  53         /**
  54          * WikiTextStructure constructor.
  55          * @param ParserOutput $parserOutput
  56          */
  57         public function __construct( ParserOutput $parserOutput ) {
  58                 $this->parserOutput = $parserOutput;
  59         }
  60
  61         /**
  62          * Get categories in the text.
  63          * @return string[]
  64          */
  65         public function categories() {
  66                 $categories = [];
  67                 foreach ( array_keys( $this->parserOutput->getCategories() ) as $key ) {
  68                         $categories[] = Category::newFromName( $key )->getTitle()->getText();
  69                 }
  70                 return $categories;
  71         }
  72
  73         /**
  74          * Get outgoing links.
  75          * @return string[]
  76          */
  77         public function outgoingLinks() {
  78                 $outgoingLinks = [];
  79                 foreach ( $this->parserOutput->getLinks() as $linkedNamespace => $namespaceLinks ) {
  80                         foreach ( array_keys( $namespaceLinks ) as $linkedDbKey ) {
  81                                 $outgoingLinks[] =
  82                                         Title::makeTitle( $linkedNamespace, $linkedDbKey )->getPrefixedDBkey();
  83                         }
  84                 }
  85                 return $outgoingLinks;
  86         }
  87
  88         /**
  89          * Get templates in the text.
  90          * @return string[]
  91          */
  92         public function templates() {
  93                 $templates = [];
  94                 foreach ( $this->parserOutput->getTemplates() as $tNS => $templatesInNS ) {
  95                         foreach ( array_keys( $templatesInNS ) as $tDbKey ) {
  96                                 $templateTitle = Title::makeTitleSafe( $tNS, $tDbKey );
  97                                 if ( $templateTitle && $templateTitle->exists() ) {
  98                                         $templates[] = $templateTitle->getPrefixedText();
  99                                 }
 100                         }
 101                 }
 102                 return $templates;
 103         }
 104
 105         /**
 106          * Get headings on the page.
 107          * @return string[]
 108          * First strip out things that look like references.  We can't use HTML filtering because
 109          * the references come back as <sup> tags without a class.  To keep from breaking stuff like
 110          *  ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
 111          * we don't remove the whole <sup> tag.  We also don't want to strip the <sup> tag and remove
 112          * everything that looks like [2] because, I dunno, maybe there is a band named Word [2] Foo
 113          * or something.  Whatever.  So we only strip things that look like <sup> tags wrapping a
 114          * reference.  And since the data looks like:
 115          *      Reference in heading <sup>&#91;1&#93;</sup><sup>&#91;2&#93;</sup>
 116          * we can not really use HtmlFormatter as we have no suitable selector.
 117          */
 118         public function headings() {
 119                 $headings = [];
 120                 $ignoredHeadings = $this->getIgnoredHeadings();
 121                 foreach ( $this->parserOutput->getSections() as $heading ) {
 122                         $heading = $heading[ 'line' ];
 123
 124                         // Some wikis wrap the brackets in a span:
 125                         // http://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
 126                         $heading = preg_replace( '/<\/?span>/', '', $heading );
 127                         // Normalize [] so the following regexp would work.
 128                         $heading = preg_replace( [ '/&#91;/', '/&#93;/' ], [ '[', ']' ], $heading );
 129                         $heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/is', '', $heading );
 130
 131                         // Strip tags from the heading or else we'll display them (escaped) in search results
 132                         $heading = trim( Sanitizer::stripAllTags( $heading ) );
 133
 134                         // Note that we don't take the level of the heading into account - all headings are equal.
 135                         // Except the ones we ignore.
 136                         if ( !in_array( $heading, $ignoredHeadings ) ) {
 137                                 $headings[] = $heading;
 138                         }
 139                 }
 140                 return $headings;
 141         }
 142
 143         /**
 144          * Parse a message content into an array. This function is generally used to
 145          * parse settings stored as i18n messages (see search-ignored-headings).
 146          *
 147          * @param string $message
 148          * @return string[]
 149          */
 150         public static function parseSettingsInMessage( $message ) {
 151                 $lines = explode( "\n", $message );
 152                 $lines = preg_replace( '/#.*$/', '', $lines ); // Remove comments
 153                 $lines = array_map( 'trim', $lines );          // Remove extra spaces
 154                 $lines = array_filter( $lines );               // Remove empty lines
 155                 return $lines;
 156         }
 157
 158         /**
 159          * Get list of heading to ignore.
 160          * @return string[]
 161          */
 162         private function getIgnoredHeadings() {
 163                 static $ignoredHeadings = null;
 164                 if ( $ignoredHeadings === null ) {
 165                         $ignoredHeadings = [];
 166                         $source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
 167                         if ( $source->isBlank() ) {
 168                                 // Try old version too, just in case
 169                                 $source = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage();
 170                         }
 171                         if ( !$source->isDisabled() ) {
 172                                 $lines = self::parseSettingsInMessage( $source->plain() );
 173                                 $ignoredHeadings = $lines;               // Now we just have headings!
 174                         }
 175                 }
 176                 return $ignoredHeadings;
 177         }
 178
 179         /**
 180          * Extract parts of the text - opening, main and auxiliary.
 181          */
 182         private function extractWikitextParts() {
 183                 if ( !is_null( $this->allText ) ) {
 184                         return;
 185                 }
 186                 $this->parserOutput->setEditSectionTokens( false );
 187                 $this->parserOutput->setTOCEnabled( false );
 188                 $text = $this->parserOutput->getText();
 189                 if ( strlen( $text ) == 0 ) {
 190                         $this->allText = "";
 191                         // empty text - nothing to seek here
 192                         return;
 193                 }
 194                 $opening = null;
 195
 196                 $this->openingText = $this->extractHeadingBeforeFirstHeading( $text );
 197
 198                 // Add extra spacing around break tags so text crammed together like<br>this
 199                 // doesn't make one word.
 200                 $text = str_replace( '<br', "\n<br", $text );
 201
 202                 $formatter = new HtmlFormatter( $text );
 203
 204                 // Strip elements from the page that we never want in the search text.
 205                 $formatter->remove( $this->excludedElementSelectors );
 206                 $formatter->filterContent();
 207
 208                 // Strip elements from the page that are auxiliary text.  These will still be
 209                 // searched but matches will be ranked lower and non-auxiliary matches will be
 210                 // preferred in highlighting.
 211                 $formatter->remove( $this->auxiliaryElementSelectors );
 212                 $auxiliaryElements = $formatter->filterContent();
 213                 $this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
 214                 foreach ( $auxiliaryElements as $auxiliaryElement ) {
 215                         $this->auxText[] =
 216                                 trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
 217                 }
 218         }
 219
 220         /**
 221          * Get text before first heading.
 222          * @param string $text
 223          * @return string|null
 224          */
 225         private function extractHeadingBeforeFirstHeading( $text ) {
 226                 $matches = [];
 227                 if ( !preg_match( '/<h[123456]>/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
 228                         // There isn't a first heading so we interpret this as the article
 229                         // being entirely without heading.
 230                         return null;
 231                 }
 232                 $text = substr( $text, 0, $matches[ 0 ][ 1 ] );
 233                 if ( !$text ) {
 234                         // There isn't any text before the first heading so we declare there isn't
 235                         // a first heading.
 236                         return null;
 237                 }
 238
 239                 $formatter = new HtmlFormatter( $text );
 240                 $formatter->remove( $this->excludedElementSelectors );
 241                 $formatter->remove( $this->auxiliaryElementSelectors );
 242                 $formatter->filterContent();
 243                 $text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
 244
 245                 if ( !$text ) {
 246                         // There isn't any text after filtering before the first heading so we declare
 247                         // that there isn't a first heading.
 248                         return null;
 249                 }
 250
 251                 return $text;
 252         }
 253
 254         /**
 255          * Get opening text
 256          * @return string
 257          */
 258         public function getOpeningText() {
 259                 $this->extractWikitextParts();
 260                 return $this->openingText;
 261         }
 262
 263         /**
 264          * Get main text
 265          * @return string
 266          */
 267         public function getMainText() {
 268                 $this->extractWikitextParts();
 269                 return $this->allText;
 270         }
 271
 272         /**
 273          * Get auxiliary text
 274          * @return string[]
 275          */
 276         public function getAuxiliaryText() {
 277                 $this->extractWikitextParts();
 278                 return $this->auxText;
 279         }
 280 }