parserOutput = $parserOutput; }

/**
 * Get categories in the text.
 * @return string[] Text of the title of every category in the parser output
 */
public function categories() {
	$categoryKeys = array_keys( $this->parserOutput->getCategories() );
	return array_map( function ( $categoryKey ) {
		$category = Category::newFromName( $categoryKey );
		return $category->getTitle()->getText();
	}, $categoryKeys );
}

/**
 * Get outgoing links.
 * @return string[] Prefixed DB key of every link target in the parser output
 */
public function outgoingLinks() {
	$targets = [];
	// The links are grouped by namespace; the DB keys are the array keys
	// of each group, the values are not used here.
	foreach ( $this->parserOutput->getLinks() as $ns => $linksInNs ) {
		foreach ( $linksInNs as $dbKey => $unused ) {
			$target = Title::makeTitle( $ns, $dbKey );
			$targets[] = $target->getPrefixedDBkey();
		}
	}
	return $targets;
}

/**
 * Get templates in the text.
 * @return string[] Prefixed text of every used template whose title is valid and exists
 */
public function templates() {
	$found = [];
	foreach ( $this->parserOutput->getTemplates() as $ns => $templatesInNs ) {
		foreach ( $templatesInNs as $dbKey => $unused ) {
			$title = Title::makeTitleSafe( $ns, $dbKey );
			// makeTitleSafe() result is checked for truthiness before use;
			// non-existent templates are skipped as well.
			if ( !$title || !$title->exists() ) {
				continue;
			}
			$found[] = $title->getPrefixedText();
		}
	}
	return $found;
}

/**
 * Get headings on the page.
 * @return string[]
 * First strip out things that look like references. We can't use HTML filtering because
 * the references come back as <sup> tags without a class. To keep from breaking stuff like
* ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
* we don't remove the whole <sup> tag. We also don't want to strip the <sup> tag and remove
* everything that looks like [2] because, I dunno, maybe there is a band named Word [2] Foo
* or something. Whatever. So we only strip things that look like <sup> tags wrapping a
* reference. And since the data looks like:
* Reference in heading <sup>&#91;1&#93;</sup><sup>&#91;2&#93;</sup>
* we can not really use HtmlFormatter as we have no suitable selector.
*/
public function headings() {
	$headings = [];
	$ignoredHeadings = $this->getIgnoredHeadings();
	foreach ( $this->parserOutput->getSections() as $section ) {
		$heading = $section[ 'line' ];

		// Some wikis wrap the brackets in a span:
		// http://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
		$heading = preg_replace( '/<\/?span>/', '', $heading );
		// Normalize entity-encoded brackets (&#91; and &#93;) to literal [ and ]
		// so the following regexp would work. These are fixed strings, so a
		// plain str_replace is enough and cannot fail on pattern compilation.
		$heading = str_replace( [ '&#91;', '&#93;' ], [ '[', ']' ], $heading );
		// Remove only <sup> elements that wrap a numeric reference marker such
		// as <sup>[1]</sup>; any other <sup> content (e.g. an exponent) is kept.
		$heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/is', '', $heading );

		// Strip tags from the heading or else we'll display them (escaped) in search results
		$heading = trim( Sanitizer::stripAllTags( $heading ) );

		// Note that we don't take the level of the heading into account - all headings are equal.
		// Except the ones we ignore. The comparison is strict so that numeric-looking
		// headings (e.g. "1e3" vs "1000") are not treated as equal.
		if ( !in_array( $heading, $ignoredHeadings, true ) ) {
			$headings[] = $heading;
		}
	}
	return $headings;
}

/**
 * Parse a message content into an array. This function is generally used to
 * parse settings stored as i18n messages (see search-ignored-headings).
 *
 * The message is split on newlines; on every line everything from the first
 * "#" to the end of the line is treated as a comment and removed, surrounding
 * whitespace is trimmed, and lines that end up empty are discarded. The keys
 * of the returned array are the original (0-based) line indexes, so they may
 * have gaps.
 *
 * @param string $message
 * @return string[]
 */
public static function parseSettingsInMessage( $message ) {
	$lines = explode( "\n", $message );
	$lines = preg_replace( '/#.*$/', '', $lines ); // Remove comments
	$lines = array_map( 'trim', $lines ); // Remove extra spaces
	// Remove empty lines. Compare against '' explicitly: array_filter() without
	// a callback would also throw away a line consisting of "0".
	return array_filter( $lines, function ( $line ) {
		return $line !== '';
	} );
}

/**
 * Get the list of headings to ignore.
 *
 * The list comes from the 'search-ignored-headings' message in the content
 * language, falling back to 'cirrussearch-ignored-headings' when the former
 * is blank. The result is kept in a function-level static, so the messages
 * are only looked up on the first call.
 *
 * @return string[]
 */
private function getIgnoredHeadings() {
	static $ignoredHeadings = null;
	if ( $ignoredHeadings !== null ) {
		return $ignoredHeadings;
	}

	$ignoredHeadings = [];
	$message = wfMessage( 'search-ignored-headings' )->inContentLanguage();
	if ( $message->isBlank() ) {
		// Try old version too, just in case
		$message = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage();
	}
	if ( $message->isDisabled() ) {
		// Nothing configured: keep the empty list
		return $ignoredHeadings;
	}

	// After parsing, what is left is just the headings
	$ignoredHeadings = self::parseSettingsInMessage( $message->plain() );
	return $ignoredHeadings;
}

/**
 * Extract parts of the text - opening, main and auxiliary.
 *
 * Fills $this->openingText, $this->allText and $this->auxText from the text
 * of the parser output. Does nothing if $this->allText has already been set,
 * so repeated calls do not redo the extraction.
 */
private function extractWikitextParts() {
	if ( !is_null( $this->allText ) ) {
		return;
	}
	$this->parserOutput->setEditSectionTokens( false );
	$this->parserOutput->setTOCEnabled( false );
	$text = $this->parserOutput->getText();
	if ( strlen( $text ) === 0 ) {
		$this->allText = "";
		// empty text - nothing to seek here
		return;
	}

	$this->openingText = $this->extractHeadingBeforeFirstHeading( $text );

	// Add extra spacing around break tags so text crammed together like<br>this
	// doesn't make one word.
	$text = str_replace( '<br', "\n<br", $text );

	$formatter = new HtmlFormatter( $text );

	// Strip elements from the page that we never want in the search text.
	$formatter->remove( $this->excludedElementSelectors );
	$formatter->filterContent();

	// Strip elements from the page that are auxiliary text. These will still be
	// searched but matches will be ranked lower and non-auxiliary matches will be
	// preferred in highlighting.
	$formatter->remove( $this->auxiliaryElementSelectors );
	$auxiliaryElements = $formatter->filterContent();
	$this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
	foreach ( $auxiliaryElements as $auxiliaryElement ) {
		$this->auxText[] =
			trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
	}
}

/**
 * Get text before first heading.
 *
 * Cuts the given text at the first opening heading tag (h1 to h6), removes
 * the excluded and auxiliary elements from that leading part and strips all
 * remaining tags.
 *
 * @param string $text Text of the parser output, handled as HTML
 * @return string|null The cleaned-up text before the first heading, or null when
 *  there is no heading or nothing is left before it
 */
private function extractHeadingBeforeFirstHeading( $text ) {
	$matches = [];
	// Find the first opening heading tag, with or without attributes.
	if ( !preg_match( '/<h[1-6][\s>]/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
		// There isn't a first heading so we interpret this as the article
		// being entirely without heading.
		return null;
	}
	// With PREG_OFFSET_CAPTURE, $matches[0][1] is the byte offset of the match,
	// i.e. where the first heading starts.
	$text = substr( $text, 0, $matches[ 0 ][ 1 ] );
	if ( !$text ) {
		// There isn't any text before the first heading so we declare there isn't
		// a first heading.
		return null;
	}

	$formatter = new HtmlFormatter( $text );
	$formatter->remove( $this->excludedElementSelectors );
	$formatter->remove( $this->auxiliaryElementSelectors );
	$formatter->filterContent();
	$text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );

	if ( !$text ) {
		// There isn't any text after filtering before the first heading so we declare
		// that there isn't a first heading.
		return null;
	}

	return $text;
}

/**
* Get opening text.
*
* Runs the text extraction first, then returns the text found before the
* first heading.
*
* @return string|null Null when the extraction found no usable text before a
*  first heading. NOTE(review): for empty page text the property is never
*  assigned by the extraction, so its declared default is returned - confirm.
*/
public function getOpeningText() {
$this->extractWikitextParts();
return $this->openingText;
}

/**
* Get main text.
*
* Runs the text extraction first, then returns the page text with all tags
* stripped, after the excluded and auxiliary elements were removed.
*
* @return string Empty string when the parser output has no text
*/
public function getMainText() {
$this->extractWikitextParts();
return $this->allText;
}

/**
* Get auxiliary text.
*
* Runs the text extraction first, then returns the tag-stripped text of each
* auxiliary element that was removed from the main text.
*
* @return string[] NOTE(review): presumably an empty array when nothing was
*  collected; that depends on the property's declared default, which is not
*  visible here - confirm.
*/
public function getAuxiliaryText() {
$this->extractWikitextParts();
return $this->auxText;
}
}}}}}}