3 use HtmlFormatter\HtmlFormatter
;
4 use MediaWiki\Logger\LoggerFactory
;
7 * Class for exploring the structure of parsed wikitext.
9 class WikiTextStructure
{
// Auxiliary text fragments. Starts out empty and is the value getAuxiliaryText() returns.
21 private $auxText = [];
// The ParserOutput handed to the constructor; read by headings() and extractWikitextParts().
// NOTE(review): $this->openingText and $this->allText are used further down, but their
// declarations are not among the visible lines (numbering jumps 21 -> 25) — confirm they exist.
25 private $parserOutput;
28 * @var string[] selectors to elements that are excluded entirely from search
// Selectors passed to HtmlFormatter::remove() before any text is collected; the elements
// they match are filtered out and their content is not kept.
// NOTE(review): the visible numbering skips original line 36 and ends at 37, so at least one
// entry and the closing bracket of this array are not shown — check the full file before editing.
30 private $excludedElementSelectors = [
31 'audio', 'video', // "it looks like you don't have javascript enabled..."
32 // do not need to index
33 'sup.reference', // The [1] for references
34 '.mw-cite-backlink', // The ↑ next to references in the references section
35 'h1', 'h2', 'h3', // Headings are already indexed in their own field.
37 '.autocollapse', // Collapsed fields are hidden by default so we don't want them
42 * @var string[] selectors to elements that are considered auxiliary to article text for search
// Selectors passed to HtmlFormatter::remove() after the excluded ones. In
// extractWikitextParts() the elements they match are kept aside and iterated to produce
// the auxiliary text; in extractHeadingBeforeFirstHeading() they are simply dropped.
// NOTE(review): the visible numbering skips original line 49 and ends at 50, so an entry
// and the closing bracket of this array may be missing from view — confirm.
44 private $auxiliaryElementSelectors = [
45 '.thumbcaption', // Thumbnail captions aren't really part of the text proper
46 'table', // Neither are tables
47 '.rellink', // Common style for "See also:".
48 '.dablink', // Common style for calling out helpful links at the top
50 '.searchaux', // New class users can use to mark stuff as auxiliary to searches.
/**
 * WikiTextStructure constructor.
 *
 * @param ParserOutput $parserOutput Parsed page that headings and text are extracted from.
 */
public function __construct( ParserOutput $parserOutput ) {
	// Only the reference is stored here; extraction is done later, on demand.
	$this->parserOutput = $parserOutput;
}
/**
 * Get headings on the page.
 *
 * Things that look like references are stripped from each heading first. HTML filtering
 * can't be used for that because the references come back as <sup> tags without a class.
 * To keep from breaking headings such as
 *   ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
 * the <sup> tag is not removed wholesale. Text that merely looks like [2] is left alone
 * too, since it may be legitimate heading text. Only <sup> tags wrapping a reference
 * marker are stripped. And since the data looks like:
 *   Reference in heading <sup>[1]</sup><sup>[2]</sup>
 * HtmlFormatter has no suitable selector, hence the plain string/regexp handling below.
 *
 * @return string[] Plain-text headings in section order, without the ignored ones.
 */
public function headings() {
	$headings = [];
	$ignoredHeadings = $this->getIgnoredHeadings();
	foreach ( $this->parserOutput->getSections() as $section ) {
		$heading = $section[ 'line' ];

		// Some wikis wrap the brackets in a span:
		// http://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
		$heading = preg_replace( '/<\/?span>/', '', $heading );
		// Brackets may arrive as numeric character references. Turn them back into
		// literal [ and ] so the reference-stripping regexp below can match. This is a
		// literal replacement: a bare '[' is not a valid PCRE pattern (it opens a
		// character class that is never closed).
		$heading = str_replace( [ '&#91;', '&#93;' ], [ '[', ']' ], $heading );
		$heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/is', '', $heading );

		// Strip tags from the heading or else we'll display them (escaped) in search results
		$heading = trim( Sanitizer::stripAllTags( $heading ) );

		// Note that we don't take the level of the heading into account - all headings are equal.
		// Except the ones we ignore. The comparison is strict so that numeric-looking
		// headings (e.g. "1e3" and "1000") are never treated as the same heading.
		if ( !in_array( $heading, $ignoredHeadings, true ) ) {
			$headings[] = $heading;
		}
	}

	return $headings;
}
/**
 * Parse a message content into an array. This function is generally used to
 * parse settings stored as i18n messages (see search-ignored-headings).
 *
 * Each line holds one entry. Everything from a '#' to the end of the line is a comment,
 * surrounding whitespace is ignored and lines that end up empty are skipped.
 *
 * @param string $message Raw message text, one setting per line.
 * @return string[] The settings, keyed by the zero-based index of the line they came from.
 */
public static function parseSettingsInMessage( $message ) {
	$settings = [];
	foreach ( explode( "\n", $message ) as $index => $line ) {
		// Remove the comment first, then the extra spaces around what is left.
		$line = trim( (string)preg_replace( '/#.*$/', '', $line ) );
		// Only truly empty lines are dropped; a line consisting of "0" is a valid
		// setting and must not be discarded by a truthiness test.
		if ( $line !== '' ) {
			$settings[$index] = $line;
		}
	}

	return $settings;
}
/**
 * Get the list of headings to ignore.
 *
 * The list is read from the 'search-ignored-headings' message in the content language,
 * falling back to the older 'cirrussearch-ignored-headings' message when the former is
 * blank. The result is kept in a static local, so it is computed once and then shared
 * by every call for the rest of the process.
 *
 * @return string[]
 */
private function getIgnoredHeadings() {
	static $ignoredHeadings = null;

	if ( $ignoredHeadings !== null ) {
		return $ignoredHeadings;
	}

	// Default to "ignore nothing" before any message lookup takes place.
	$ignoredHeadings = [];

	$message = wfMessage( 'search-ignored-headings' )->inContentLanguage();
	if ( $message->isBlank() ) {
		// Try old version too, just in case
		$message = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage();
	}

	if ( !$message->isDisabled() ) {
		// Once comments and blank lines are gone, only headings remain.
		$ignoredHeadings = self::parseSettingsInMessage( $message->plain() );
	}

	return $ignoredHeadings;
}
136 * Extract parts of the text - opening, main and auxiliary.
// Fills $this->openingText, $this->allText and (see the note at the end) the auxiliary
// text from the HTML of the parser output. getOpeningText(), getMainText() and
// getAuxiliaryText() each call this before returning their field.
138 private function extractWikitextParts() {
// Guard on text that has already been extracted.
// NOTE(review): original lines 140-141 are not visible here; presumably an early return so
// the work below runs only once per instance — confirm.
139 if ( !is_null( $this->allText
) ) {
// Turn off edit-section tokens and the TOC on the parser output before fetching its text.
142 $this->parserOutput
->setEditSectionTokens( false );
143 $this->parserOutput
->setTOCEnabled( false );
144 $text = $this->parserOutput
->getText();
145 if ( strlen( $text ) == 0 ) {
// NOTE(review): original lines 146 and 148-151 are not visible; what this branch assigns
// and where it returns cannot be confirmed from the visible lines.
147 // empty text - nothing to seek here
// The opening text is taken from the HTML as fetched, before the <br> tweak below.
152 $this->openingText
= $this->extractHeadingBeforeFirstHeading( $text );
154 // Add extra spacing around break tags so text crammed together like<br>this
155 // doesn't make one word.
156 $text = str_replace( '<br', "\n<br", $text );
158 $formatter = new HtmlFormatter( $text );
160 // Strip elements from the page that we never want in the search text.
161 $formatter->remove( $this->excludedElementSelectors
);
// The return value is ignored here: nothing is kept from the excluded elements.
162 $formatter->filterContent();
164 // Strip elements from the page that are auxiliary text. These will still be
165 // searched but matches will be ranked lower and non-auxiliary matches will be
166 // preferred in highlighting.
167 $formatter->remove( $this->auxiliaryElementSelectors
);
// This time the result of filterContent() is kept so each element can be processed below.
168 $auxiliaryElements = $formatter->filterContent();
// What is left in the formatter after both removals becomes the main text, tags stripped.
169 $this->allText
= trim( Sanitizer
::stripAllTags( $formatter->getText() ) );
170 foreach ( $auxiliaryElements as $auxiliaryElement ) {
// NOTE(review): original line 171 is not visible; the trimmed text below is presumably
// appended to $this->auxText — confirm against the full file.
172 trim( Sanitizer
::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
177 * Get text before first heading.
178 * @param string $text
179 * @return string|null
// Cuts $text at the first heading tag and reduces what precedes it to plain text, removing
// the same excluded and auxiliary selectors as the main extraction does.
// NOTE(review): despite the name, the visible code extracts the text *before* the first
// heading, not a heading.
181 private function extractHeadingBeforeFirstHeading( $text ) {
// Only bare <h1>..<h6> tags match; a heading tag that carries attributes would not.
// With PREG_OFFSET_CAPTURE, $matches[ 0 ][ 1 ] is the byte offset of the match in $text.
183 if ( !preg_match( '/<h[123456]>/', $text, $matches, PREG_OFFSET_CAPTURE
) ) {
184 // There isn't a first heading so we interpret this as the article
185 // being entirely without heading.
// NOTE(review): original lines 186-187 are not visible; presumably this branch returns null
// (the docblock declares string|null) — confirm.
// Keep only the part of the HTML that precedes the matched heading tag.
188 $text = substr( $text, 0, $matches[ 0 ][ 1 ] );
// NOTE(review): the condition guarding the next comment (original line 189) and the rest of
// that branch (191-194) are not visible.
190 // There isn't any text before the first heading so we declare there isn't
195 $formatter = new HtmlFormatter( $text );
196 $formatter->remove( $this->excludedElementSelectors
);
197 $formatter->remove( $this->auxiliaryElementSelectors
);
// Both selector lists are removed in a single pass; the removed elements are not kept.
198 $formatter->filterContent();
199 $text = trim( Sanitizer
::stripAllTags( $formatter->getText() ) );
// NOTE(review): the check belonging to the comment below (original lines 200-201) and the
// return statements (204 onwards) are not visible.
202 // There isn't any text after filtering before the first heading so we declare
203 // that there isn't a first heading.
/**
 * Get the opening text of the page, i.e. the text found before the first heading.
 *
 * @return string|null Whatever extractWikitextParts() stored as the opening text.
 */
public function getOpeningText() {
	// Extraction is triggered on demand by each getter.
	$this->extractWikitextParts();

	return $this->openingText;
}
/**
 * Get the main text of the page: the rendered text left once excluded and auxiliary
 * elements have been removed and all tags stripped.
 *
 * @return string|null Whatever extractWikitextParts() stored as the main text.
 */
public function getMainText() {
	// Extraction is triggered on demand by each getter.
	$this->extractWikitextParts();

	return $this->allText;
}
/**
 * Get the auxiliary text of the page (captions, tables and similar side content).
 *
 * @return array Auxiliary text fragments; an empty array when none were collected.
 */
public function getAuxiliaryText() {
	// Extraction is triggered on demand by each getter.
	$this->extractWikitextParts();

	return $this->auxText;
}