3 use HtmlFormatter\HtmlFormatter
;
6 * Class allowing to explore structure of parsed wikitext.
8 class WikiTextStructure
{
20 private $auxText = [];
24 private $parserOutput;
27 * @var string[] selectors to elements that are excluded entirely from search
29 private $excludedElementSelectors = [
30 // "it looks like you don't have javascript enabled..." – do not need to index
32 // CSS stylesheets aren't content
34 // The [1] for references from Cite
36 // The ↑ next to references in the references section from Cite
38 // Headings are already indexed in their own field.
39 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
40 // Collapsed fields are hidden by default so we don't want them showing up.
42 // Content explicitly decided to be not searchable by editors such
43 // as custom navigation templates.
44 '.navigation-not-searchable',
45 // User-facing interface code prompting the user to act from WikibaseMediaInfo
46 '.wbmi-entityview-emptyCaption',
50 * @var string[] selectors to elements that are considered auxiliary to article text for search
52 private $auxiliaryElementSelectors = [
53 // Thumbnail captions aren't really part of the text proper
57 // Common style for "See also:".
59 // Common style for calling out helpful links at the top of the article.
61 // New class users can use to mark stuff as auxiliary to searches.
/**
 * Keep a handle on the parser output that every accessor of this class reads from.
 *
 * @param ParserOutput $parserOutput
 */
public function __construct( ParserOutput $parserOutput ) {
	$this->parserOutput = $parserOutput;
}
/**
 * Get headings on the page.
 *
 * First strip out things that look like references. We can't use HTML filtering because
 * the references come back as <sup> tags without a class. To keep from breaking stuff like
 *  ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
 * we don't remove the whole <sup> tag. We also don't want to strip the <sup> tag and remove
 * everything that looks like [2] because, I dunno, maybe there is a band named Word [2] Foo
 * or something. Whatever. So we only strip things that look like <sup> tags wrapping a
 * reference. And since the data looks like:
 *      Reference in heading <sup>[1]</sup><sup>[2]</sup>
 * we cannot really use HtmlFormatter as we have no suitable selector.
 *
 * @return string[] Tag-free heading texts in section order, without the ignored headings
 */
public function headings() {
	$headings = [];
	$ignoredHeadings = $this->getIgnoredHeadings();
	foreach ( $this->parserOutput->getSections() as $section ) {
		$heading = $section[ 'line' ];

		// Some wikis wrap the brackets in a span:
		// https://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
		$heading = preg_replace( '/<\/?span>/', '', $heading );
		// Normalize entity-encoded brackets to literal [] so the following regexp would work.
		// NOTE(review): the patterns found here were the bare '/[/' and '/]/'. '/[/' is not
		// valid PCRE (unterminated character class), which makes preg_replace() return null
		// and blanks every heading. Matching the numeric entities is the presumed intent -
		// confirm against what Cite actually emits.
		$heading = str_replace( [ '&#91;', '&#93;' ], [ '[', ']' ], $heading );
		$heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/is', '', $heading );

		// Strip tags from the heading or else we'll display them (escaped) in search results
		$heading = trim( Sanitizer::stripAllTags( $heading ) );

		// Note that we don't take the level of the heading into account - all headings are equal.
		// Except the ones we ignore. The comparison is strict so that numeric-looking headings
		// ("1" vs "01") are not treated as the same heading.
		if ( !in_array( $heading, $ignoredHeadings, true ) ) {
			$headings[] = $heading;
		}
	}

	return $headings;
}
/**
 * Parse a message content into an array. This function is generally used to
 * parse settings stored as i18n messages (see search-ignored-headings).
 *
 * The message is split on "\n". On every line, everything from the first '#'
 * to the end of the line is treated as a comment and removed, the remainder is
 * trimmed, and lines that end up empty are dropped. The surviving entries keep
 * the index of the line they came from as their array key, so the result is
 * not necessarily a gap-free list.
 *
 * @param string $message
 * @return string[]
 */
public static function parseSettingsInMessage( $message ) {
	$lines = explode( "\n", $message );
	$lines = preg_replace( '/#.*$/', '', $lines ); // Remove comments
	$lines = array_map( 'trim', $lines ); // Remove extra spaces

	// Remove empty lines. Compare against '' explicitly: array_filter() without a
	// callback drops every falsy value, which would also discard the setting "0".
	return array_filter( $lines, static function ( $line ) {
		return $line !== '';
	} );
}
/**
 * Get list of headings to ignore.
 *
 * The list is read from the 'search-ignored-headings' message in the content
 * language, falling back to 'cirrussearch-ignored-headings' when the former is
 * blank. A disabled message yields an empty list. The result is kept in a
 * function-static variable, so it is computed once and reused on later calls.
 *
 * @return string[]
 */
private function getIgnoredHeadings() {
	static $ignoredHeadings = null;

	if ( $ignoredHeadings !== null ) {
		return $ignoredHeadings;
	}

	$ignoredHeadings = [];
	$source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
	if ( $source->isBlank() ) {
		// Try old version too, just in case
		$source = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage();
	}
	if ( !$source->isDisabled() ) {
		// After parsing, what is left is just the headings themselves.
		$ignoredHeadings = self::parseSettingsInMessage( $source->plain() );
	}

	return $ignoredHeadings;
}
/**
 * Extract parts of the text - opening, main and auxiliary.
 *
 * Fills $this->openingText, $this->allText and $this->auxText from the rendered
 * parser output. The work is done at most once: a non-null $this->allText means
 * extraction already ran and the call returns immediately.
 */
private function extractWikitextParts() {
	if ( $this->allText !== null ) {
		// Already extracted on an earlier call.
		return;
	}

	// NOTE(review): 'enableSectionEditTokens' is the only getText() option visible in
	// this copy of the file - confirm that no further option was lost from this list.
	$text = $this->parserOutput->getText( [
		'enableSectionEditTokens' => false,
	] );
	if ( strlen( $text ) === 0 ) {
		// empty text - nothing to seek here
		// Record the empty result so the guard above short-circuits next time.
		$this->allText = '';
		return;
	}

	$this->openingText = $this->extractHeadingBeforeFirstHeading( $text );

	$formatter = new HtmlFormatter( $text );

	// Strip elements from the page that we never want in the search text.
	$formatter->remove( $this->excludedElementSelectors );
	$formatter->filterContent();

	// Strip elements from the page that are auxiliary text. These will still be
	// searched but matches will be ranked lower and non-auxiliary matches will be
	// preferred in highlighting.
	$formatter->remove( $this->auxiliaryElementSelectors );
	$auxiliaryElements = $formatter->filterContent();
	$this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
	foreach ( $auxiliaryElements as $auxiliaryElement ) {
		// Each removed auxiliary element contributes one plain-text entry.
		$this->auxText[] =
			trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
	}
}
/**
 * Get text before first heading.
 *
 * The HTML is cut at the first bare <h1>..<h6> opening tag, the excluded and
 * auxiliary elements are removed from that prefix, and what remains is returned
 * as trimmed plain text.
 *
 * NOTE(review): the emptiness checks and null returns below are reconstructed from
 * the surviving comments and the string|null return contract - confirm them.
 *
 * @param string $text
 * @return string|null Text before the first heading, or null when there is no
 *  heading or no text in front of it
 */
private function extractHeadingBeforeFirstHeading( $text ) {
	$matches = [];
	if ( !preg_match( '/<h[123456]>/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
		// There isn't a first heading so we interpret this as the article
		// being entirely without heading.
		return null;
	}

	// With PREG_OFFSET_CAPTURE, index 1 of the match is its byte offset in $text.
	$headingOffset = $matches[ 0 ][ 1 ];
	if ( $headingOffset === 0 ) {
		// There isn't any text before the first heading so we declare there isn't
		// a first heading.
		return null;
	}
	$text = substr( $text, 0, $headingOffset );

	$formatter = new HtmlFormatter( $text );
	$formatter->remove( $this->excludedElementSelectors );
	$formatter->remove( $this->auxiliaryElementSelectors );
	$formatter->filterContent();
	$text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );

	if ( $text === '' ) {
		// There isn't any text after filtering before the first heading so we declare
		// that there isn't a first heading.
		return null;
	}

	return $text;
}
/**
 * Get the opening text, i.e. the text found before the first heading.
 *
 * Triggers the (one-time) extraction of the text parts if it has not run yet.
 *
 * @return string|null
 */
public function getOpeningText() {
	$this->extractWikitextParts();
	return $this->openingText;
}
/**
 * Get the main text: the rendered page as plain text once excluded and
 * auxiliary elements have been removed.
 *
 * Triggers the (one-time) extraction of the text parts if it has not run yet.
 *
 * @return string
 */
public function getMainText() {
	$this->extractWikitextParts();
	return $this->allText;
}
/**
 * Get the auxiliary text: one plain-text entry per element matched by the
 * auxiliary selectors.
 *
 * Triggers the (one-time) extraction of the text parts if it has not run yet.
 *
 * @return string[]
 */
public function getAuxiliaryText() {
	$this->extractWikitextParts();
	return $this->auxText;
}
/**
 * Get the defaultsort property
 *
 * Reads the 'defaultsort' property straight from the wrapped parser output.
 *
 * @return string|null
 */
public function getDefaultSort() {
	return $this->parserOutput->getProperty( 'defaultsort' );
}