ba203cacfad8ce8387213168095cecc132a5b681
[lhc/web/wiklou.git] / includes / search / SearchEngine.php
1 <?php
2 /**
3 * Basic search engine
4 *
5 * @file
6 * @ingroup Search
7 */
8
9 /**
10 * @defgroup Search Search
11 */
12
/**
 * Base class for search backends.
 *
 * The text/title search and index-update methods are stubs meant to be
 * overridden; the static helpers cover "go" near-match lookup, namespace
 * selection and suggestion URL templates.
 *
 * @ingroup Search
 */
class SearchEngine {
	public $limit = 10;
	public $offset = 0;
	public $prefix = '';
	public $searchTerms = array();
	public $namespaces = array( NS_MAIN );
	public $showRedirects = false;

	/**
	 * @var DatabaseBase
	 */
	protected $db;

	/**
	 * @param $db DatabaseBase|null: connection to use; when omitted (or
	 *   falsy) a DB_SLAVE connection is fetched via wfGetDB()
	 */
	function __construct($db = null) {
		$this->db = $db ? $db : wfGetDB( DB_SLAVE );
	}

	/**
	 * Perform a full text search query and return a result set.
	 * If text searches are not supported or disabled, return null.
	 * STUB
	 *
	 * @param $term String: raw search term
	 * @return SearchResultSet
	 */
	function searchText( $term ) {
		return null;
	}

	/**
	 * Perform a title-only search query and return a result set.
	 * If title searches are not supported or disabled, return null.
	 * STUB
	 *
	 * @param $term String: raw search term
	 * @return SearchResultSet
	 */
	function searchTitle( $term ) {
		return null;
	}

	/**
	 * Whether this search backend can list/unlist redirects.
	 *
	 * @return Boolean
	 */
	function acceptListRedirects() {
		return true;
	}

	/**
	 * Normalise text before it is searched for or written to the index.
	 * Derived classes may add database-specific conversions. The default
	 * implementation only passes the text through the content language's
	 * segmentByWord().
	 *
	 * @param $string string: String to process
	 * @return string
	 */
	public function normalizeText( $string ) {
		global $wgContLang;

		// Some languages such as Chinese require word segmentation
		$segmented = $wgContLang->segmentByWord( $string );
		return $segmented;
	}

	/**
	 * Transform search term in cases when parts of the query came as different GET params (when supported)
	 * e.g. for prefix queries: search=test&prefix=Main_Page/Archive -> test prefix:Main Page/Archive
	 *
	 * The base implementation returns the term unchanged.
	 *
	 * @param $term String
	 * @return String
	 */
	function transformSearchTerm( $term ) {
		return $term;
	}

	/**
	 * If an exact title match can be found, or a very slightly close match,
	 * return the title. If no match, returns NULL.
	 *
	 * The 'SearchGetNearMatchComplete' hook runs on the outcome and may
	 * replace it.
	 *
	 * @param $searchterm String
	 * @return Title
	 */
	public static function getNearMatch( $searchterm ) {
		$title = self::getNearMatchInternal( $searchterm );
		wfRunHooks( 'SearchGetNearMatchComplete', array( $searchterm, &$title ) );

		return $title;
	}

	/**
	 * Do a near match (see SearchEngine::getNearMatch) and wrap it into a
	 * SearchResultSet.
	 *
	 * @param $searchterm string
	 * @return SearchResultSet
	 */
	public static function getNearMatchResultSet( $searchterm ) {
		$match = self::getNearMatch( $searchterm );
		return new SearchNearMatchResultSet( $match );
	}

	/**
	 * Really find the title match.
	 *
	 * Each candidate (the raw term followed by its language variants) is
	 * tried as-is and then in several capitalisations. If nothing matches,
	 * the raw term gets special handling for IP addresses, the User, File
	 * and MediaWiki namespaces, and surrounding double quotes.
	 *
	 * @param $searchterm String
	 * @return Title|null
	 */
	private static function getNearMatchInternal( $searchterm ) {
		global $wgContLang;

		$candidates = array( $searchterm );
		if ( $wgContLang->hasVariants() ) {
			$candidates = array_merge( $candidates, $wgContLang->autoConvertToAllVariants( $searchterm ) );
		}

		$titleResult = null;
		if ( !wfRunHooks( 'SearchGetNearMatchBefore', array( $candidates, &$titleResult ) ) ) {
			return $titleResult;
		}

		$context = new RequestContext;

		// Content-language case conversions, in the order they are tried:
		// all lower case (i.e. first letter capitalized), capitalized words,
		// all upper case, Word-Caps-Breaking-At-Word-Breaks (hyphenated names etc).
		$caseConversions = array( 'lc', 'ucwords', 'uc', 'ucwordbreaks' );

		foreach ( $candidates as $term ) {
			# Exact match? No need to look further.
			$title = Title::newFromText( $term );
			if ( is_null( $title ) ) {
				return null;
			}

			if ( $title->getNamespace() == NS_SPECIAL || $title->isExternal() || $title->exists() ) {
				return $title;
			}

			# See if it still otherwise has content in some sane sense
			$context->setTitle( $title );
			$article = MediaWiki::articleFromTitle( $title, $context );
			if ( $article->hasViewableContent() ) {
				return $title;
			}

			foreach ( $caseConversions as $conversion ) {
				$title = Title::newFromText( $wgContLang->$conversion( $term ) );
				if ( $title && $title->exists() ) {
					return $title;
				}
			}

			// Give hooks a chance at better match variants
			$title = null;
			if ( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) {
				return $title;
			}
		}

		$title = Title::newFromText( $searchterm );
		$ns = $title->getNamespace();

		# Entering an IP address goes to the contributions page
		$isUserIP = ( $ns == NS_USER && User::isIP( $title->getText() ) );
		if ( $isUserIP || User::isIP( trim( $searchterm ) ) ) {
			return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
		}

		# Entering a user goes to the user page whether it's there or not
		if ( $ns == NS_USER ) {
			return $title;
		}

		# Go to images that exist even if there's no local page.
		# There may have been a funny upload, or it may be on a shared
		# file repository such as Wikimedia Commons.
		if ( $ns == NS_FILE && wfFindFile( $title ) ) {
			return $title;
		}

		# MediaWiki namespace? Page may be "implied" if not customized.
		# Just return it, with caps forced as the message system likes it.
		if ( $ns == NS_MEDIAWIKI ) {
			return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
		}

		# Quoted term? Try without the quotes...
		$unquoted = array();
		if ( preg_match( '/^"([^"]+)"$/', $searchterm, $unquoted ) ) {
			return self::getNearMatch( $unquoted[1] );
		}

		return null;
	}

	/**
	 * Character-class fragment (for use inside a regex [...]) listing the
	 * characters that filter() keeps.
	 *
	 * @return String
	 */
	public static function legalSearchChars() {
		return "A-Za-z_'.0-9\\x80-\\xFF\\-";
	}

	/**
	 * Set the maximum number of results to return
	 * and how many to skip before returning the first.
	 * Both values are coerced with intval().
	 *
	 * @param $limit Integer
	 * @param $offset Integer
	 */
	function setLimitOffset( $limit, $offset = 0 ) {
		$this->offset = intval( $offset );
		$this->limit = intval( $limit );
	}

	/**
	 * Set which namespaces the search should include.
	 * Give an array of namespace index numbers.
	 *
	 * @param $namespaces Array
	 */
	function setNamespaces( $namespaces ) {
		$this->namespaces = $namespaces;
	}

	/**
	 * Parse some common prefixes: all (search everything)
	 * or namespace names.
	 *
	 * A recognised prefix updates $this->namespaces and is stripped from
	 * the query; if stripping would leave nothing, the original query is
	 * returned. 'SearchEngineReplacePrefixesComplete' runs last in every
	 * case and may alter the result.
	 *
	 * @param $query String
	 * @return String
	 */
	function replacePrefixes( $query ) {
		global $wgContLang;

		$parsed = $query;
		$colonPos = strpos( $query, ':' );

		if ( $colonPos !== false ) {
			$allKeyword = wfMsgForContent( 'searchall' ) . ":";
			$allKeywordLen = strlen( $allKeyword );

			if ( strncmp( $query, $allKeyword, $allKeywordLen ) == 0 ) {
				$this->namespaces = null;
				$parsed = substr( $query, $allKeywordLen );
			} else {
				$prefix = substr( $query, 0, $colonPos );
				$index = $wgContLang->getNsIndex( $prefix );
				if ( $index !== false ) {
					$this->namespaces = array( $index );
					$parsed = substr( $query, $colonPos + 1 );
				}
			}

			if ( trim( $parsed ) == '' ) {
				$parsed = $query; // prefix was the whole query
			}
		}

		wfRunHooks( 'SearchEngineReplacePrefixesComplete', array( $this, $query, &$parsed ) );

		return $parsed;
	}

	/**
	 * Make a list of searchable namespaces and their canonical names.
	 * Namespaces with an index below NS_MAIN are left out; the
	 * 'SearchableNamespaces' hook may adjust the list.
	 *
	 * @return Array
	 */
	public static function searchableNamespaces() {
		global $wgContLang;

		$searchable = array();
		foreach ( $wgContLang->getNamespaces() as $index => $name ) {
			if ( $index < NS_MAIN ) {
				continue;
			}
			$searchable[$index] = $name;
		}

		wfRunHooks( 'SearchableNamespaces', array( &$searchable ) );
		return $searchable;
	}

	/**
	 * Extract default namespaces to search from the given user's
	 * settings, returning a list of index numbers.
	 *
	 * @param $user User
	 * @return Array
	 */
	public static function userNamespaces( $user ) {
		global $wgSearchEverythingOnlyLoggedIn;

		// The "search everything" preference is honoured for everybody,
		// unless it has been restricted to logged-in users.
		$searchEverything = false;
		if ( !$wgSearchEverythingOnlyLoggedIn || $user->isLoggedIn() ) {
			$searchEverything = $user->getOption( 'searcheverything' );
		}

		// searcheverything overrides other options
		if ( $searchEverything ) {
			return array_keys( self::searchableNamespaces() );
		}

		$preferred = Preferences::loadOldSearchNs( $user );
		$searchable = self::searchableNamespaces();

		// Keep only namespaces that are actually searchable
		return array_intersect( $preferred, array_keys( $searchable ) );
	}

	/**
	 * Find snippet highlight settings for a given user.
	 * The values are currently hardcoded; $user is not consulted.
	 *
	 * @param $user User
	 * @return Array contextlines, contextchars
	 */
	public static function userHighlightPrefs( &$user ) {
		// The user's 'contextlines' / 'contextchars' options are deliberately
		// ignored in favour of fixed values.
		return array( 2, 75 );
	}

	/**
	 * An array of namespaces indexes to be searched by default
	 *
	 * @return Array
	 */
	public static function defaultNamespaces() {
		global $wgNamespacesToBeSearchedDefault;

		$enabled = array_keys( $wgNamespacesToBeSearchedDefault, true );
		return $enabled;
	}

	/**
	 * Get a list of namespace names useful for showing in tooltips
	 * and preferences. Empty names are replaced by the 'blanknamespace'
	 * message.
	 *
	 * @param $namespaces Array
	 * @return Array
	 */
	public static function namespacesAsText( $namespaces ) {
		global $wgContLang;

		$names = array_map( array( $wgContLang, 'getFormattedNsText' ), $namespaces );
		foreach ( $names as $key => $name ) {
			if ( empty( $name ) ) {
				$names[$key] = wfMsg( 'blanknamespace' );
			}
		}
		return $names;
	}

	/**
	 * Return the help namespaces to be shown on Special:Search
	 *
	 * @return Array
	 */
	public static function helpNamespaces() {
		global $wgNamespacesToBeSearchedHelp;

		$enabled = array_keys( $wgNamespacesToBeSearchedHelp, true );
		return $enabled;
	}

	/**
	 * Return a 'cleaned up' search string: every character outside
	 * legalSearchChars() becomes a space, then the result is trimmed.
	 *
	 * @param $text String
	 * @return String
	 */
	function filter( $text ) {
		// Called through $this so that a subclass override of
		// legalSearchChars() is honoured.
		$legal = $this->legalSearchChars();
		$cleaned = preg_replace( "/[^{$legal}]/", " ", $text );
		return trim( $cleaned );
	}

	/**
	 * Load up the appropriate search engine class for the currently
	 * active database backend, and return a configured instance.
	 *
	 * $wgSearchType, when set, names the class to use; in that case the
	 * engine is constructed without a database handle.
	 *
	 * @return SearchEngine
	 */
	public static function create() {
		global $wgSearchType;

		if ( $wgSearchType ) {
			$dbr = null;
			$class = $wgSearchType;
		} else {
			$dbr = wfGetDB( DB_SLAVE );
			$class = $dbr->getSearchEngine();
		}

		$engine = new $class( $dbr );
		$engine->setLimitOffset( 0, 0 );
		return $engine;
	}

	/**
	 * Create or update the search index record for the given page.
	 * Title and text should be pre-processed.
	 * STUB
	 *
	 * @param $id Integer
	 * @param $title String
	 * @param $text String
	 */
	function update( $id, $title, $text ) {
		// no-op
	}

	/**
	 * Update a search index record's title only.
	 * Title should be pre-processed.
	 * STUB
	 *
	 * @param $id Integer
	 * @param $title String
	 */
	function updateTitle( $id, $title ) {
		// no-op
	}

	/**
	 * Get OpenSearch suggestion template: $wgOpenSearchTemplate when set,
	 * otherwise an api.php URL restricted to the default namespaces.
	 *
	 * @return String
	 */
	public static function getOpenSearchTemplate() {
		global $wgOpenSearchTemplate, $wgServer;

		if ( $wgOpenSearchTemplate ) {
			return $wgOpenSearchTemplate;
		}

		$ns = implode( '|', self::defaultNamespaces() );
		if ( !$ns ) {
			$ns = "0";
		}
		return $wgServer . wfScript( 'api' ) . '?action=opensearch&search={searchTerms}&namespace=' . $ns;
	}

	/**
	 * Get internal MediaWiki Suggest template: $wgMWSuggestTemplate when
	 * set, otherwise an api.php URL with a {namespaces} placeholder.
	 *
	 * @return String
	 */
	public static function getMWSuggestTemplate() {
		global $wgMWSuggestTemplate, $wgServer;

		if ( $wgMWSuggestTemplate ) {
			return $wgMWSuggestTemplate;
		}
		return $wgServer . wfScript( 'api' ) . '?action=opensearch&search={searchTerms}&namespace={namespaces}&suggest';
	}
}
469
/**
 * Base result set. Every method is a stub describing an empty set;
 * concrete result sets override what they support.
 *
 * @ingroup Search
 */
class SearchResultSet {
	/**
	 * Fetch an array of regular expression fragments for matching
	 * the search terms as parsed by this engine in a text extract.
	 * STUB
	 *
	 * @return Array
	 */
	function termMatches() {
		return array();
	}

	/**
	 * Number of rows in this set.
	 * STUB
	 *
	 * @return Integer
	 */
	function numRows() {
		return 0;
	}

	/**
	 * Return true if results are included in this result set.
	 * STUB
	 *
	 * @return Boolean
	 */
	function hasResults() {
		return false;
	}

	/**
	 * Some search modes return a total hit count for the query
	 * in the entire article database. This may include pages
	 * in namespaces that would not be matched on the given
	 * settings.
	 *
	 * Return null if no total hits number is supported.
	 *
	 * @return Integer
	 */
	function getTotalHits() {
		return null;
	}

	/**
	 * Some search modes return a suggested alternate term if there are
	 * no exact hits. Returns true if there is one on this set.
	 *
	 * @return Boolean
	 */
	function hasSuggestion() {
		return false;
	}

	/**
	 * @return String: suggested query, null if none
	 */
	function getSuggestionQuery() {
		return null;
	}

	/**
	 * @return String: HTML highlighted suggested query, '' if none
	 */
	function getSuggestionSnippet() {
		return '';
	}

	/**
	 * Return information about how and from where the results were fetched,
	 * should be useful for diagnostics and debugging.
	 * The base implementation has nothing to report and returns null.
	 *
	 * @return String
	 */
	function getInfo() {
		return null;
	}

	/**
	 * Return a result set of hits on other (multiple) wikis associated with this one
	 *
	 * @return SearchResultSet
	 */
	function getInterwikiResults() {
		return null;
	}

	/**
	 * Check if there are results on other wikis, i.e. whether
	 * getInterwikiResults() yields something other than null.
	 *
	 * @return Boolean
	 */
	function hasInterwikiResults() {
		$interwiki = $this->getInterwikiResults();
		return $interwiki != null;
	}

	/**
	 * Fetches next search result, or false.
	 * STUB
	 *
	 * @return SearchResult
	 */
	function next() {
		return false;
	}

	/**
	 * Frees the result set, if applicable.
	 */
	function free() {
		// nothing to release in the base class
	}
}
582
/**
 * This class is used for different SQL-based search engines shipped with MediaWiki
 */
class SqlSearchResultSet extends SearchResultSet {
	/**
	 * Database result object (must provide numRows(), fetchObject() and
	 * free()), or false when there is no result.
	 *
	 * Declared explicitly instead of being created dynamically in the
	 * constructor; kept public because that was its effective visibility.
	 */
	public $mResultSet;

	/**
	 * Value handed back verbatim by termMatches().
	 *
	 * @var Array
	 */
	public $mTerms;

	/**
	 * @param $resultSet mixed: database result object, or false
	 * @param $terms Array: returned as-is by termMatches()
	 */
	function __construct( $resultSet, $terms ) {
		$this->mResultSet = $resultSet;
		$this->mTerms = $terms;
	}

	/**
	 * @return Array: the terms passed to the constructor
	 */
	function termMatches() {
		return $this->mTerms;
	}

	/**
	 * @return Integer|false: row count of the wrapped result, or false
	 *   when there is no result set
	 */
	function numRows() {
		if ( $this->mResultSet === false ) {
			return false;
		}

		return $this->mResultSet->numRows();
	}

	/**
	 * Fetch the next row and wrap it in a SearchResult.
	 *
	 * @return SearchResult|false: false when there is no result set or
	 *   no row is left
	 */
	function next() {
		if ( $this->mResultSet === false ) {
			return false;
		}

		$row = $this->mResultSet->fetchObject();
		if ( $row === false ) {
			return false;
		}

		return SearchResult::newFromRow( $row );
	}

	/**
	 * Free the wrapped result.
	 *
	 * @return false|null: false when there is no result set to free,
	 *   otherwise nothing
	 */
	function free() {
		if ( $this->mResultSet === false ) {
			return false;
		}

		$this->mResultSet->free();
	}
}
621
/**
 * Empty marker class: some search engines may bail out if too many
 * matches are found.
 *
 * @ingroup Search
 */
class SearchResultTooMany {
}
628
629
/**
 * A single search hit: a Title plus, where available, its current
 * Revision and File.
 *
 * @todo Fixme: This class is horribly factored. It would probably be better to
 * have a useful base class to which you pass some standard information, then
 * let the fancy self-highlighters extend that.
 * @ingroup Search
 */
class SearchResult {

	/**
	 * @var Revision
	 */
	public $mRevision = null;

	/**
	 * Result of wfFindFile() for titles in NS_FILE, otherwise null.
	 */
	public $mImage = null;

	/**
	 * @var Title
	 */
	public $mTitle;

	/**
	 * Page text, loaded on demand by initText().
	 *
	 * @var String
	 */
	public $mText;

	/**
	 * Return a new SearchResult and initializes it with a title.
	 *
	 * @param $title Title
	 * @return SearchResult
	 */
	public static function newFromTitle( $title ) {
		$instance = new self();
		$instance->initFromTitle( $title );
		return $instance;
	}

	/**
	 * Return a new SearchResult and initializes it with a row.
	 *
	 * @param $row object
	 * @return SearchResult
	 */
	public static function newFromRow( $row ) {
		$instance = new self();
		$instance->initFromRow( $row );
		return $instance;
	}

	/**
	 * @param $row object|null: when given, the result is initialised
	 *   from it (backwards compatibility with pre-1.17 callers)
	 */
	public function __construct( $row = null ) {
		if ( is_null( $row ) ) {
			return;
		}
		$this->initFromRow( $row );
	}

	/**
	 * Initialize from a database row. Makes a Title and passes that to
	 * initFromTitle.
	 *
	 * @param $row object: must carry page_namespace and page_title
	 */
	protected function initFromRow( $row ) {
		$title = Title::makeTitle( $row->page_namespace, $row->page_title );
		$this->initFromTitle( $title );
	}

	/**
	 * Initialize from a Title and if possible initializes a corresponding
	 * Revision and File.
	 *
	 * @param $title Title
	 */
	protected function initFromTitle( $title ) {
		$this->mTitle = $title;
		if ( is_null( $title ) ) {
			return;
		}

		$this->mRevision = Revision::newFromTitle( $title );
		if ( $title->getNamespace() === NS_FILE ) {
			$this->mImage = wfFindFile( $title );
		}
	}

	/**
	 * Check if this result points to an invalid title
	 *
	 * @return Boolean
	 */
	function isBrokenTitle() {
		return is_null( $this->mTitle );
	}

	/**
	 * Check if target page is missing, happens when index is out of date.
	 * True only when neither a revision nor a file was found.
	 *
	 * @return Boolean
	 */
	function isMissingRevision() {
		return !( $this->mRevision || $this->mImage );
	}

	/**
	 * @return Title
	 */
	function getTitle() {
		return $this->mTitle;
	}

	/**
	 * @return Double or null if not supported
	 */
	function getScore() {
		return null;
	}

	/**
	 * Lazy initialization of article text from DB.
	 * Without a revision the text is the empty string.
	 */
	protected function initText() {
		if ( isset( $this->mText ) ) {
			return;
		}

		// TODO: can we fetch raw wikitext for commons images?
		$this->mText = ( $this->mRevision != null )
			? $this->mRevision->getText()
			: '';
	}

	/**
	 * @param $terms Array: terms to highlight
	 * @return String: highlighted text snippet, null (and not '') if not supported
	 */
	function getTextSnippet( $terms ) {
		global $wgUser, $wgAdvancedSearchHighlighting;

		$this->initText();
		list( $contextlines, $contextchars ) = SearchEngine::userHighlightPrefs( $wgUser );

		$highlighter = new SearchHighlighter();
		$method = $wgAdvancedSearchHighlighting ? 'highlightText' : 'highlightSimple';

		return $highlighter->$method( $this->mText, $terms, $contextlines, $contextchars );
	}

	/**
	 * @param $terms Array: terms to highlight
	 * @return String: highlighted title, '' if not supported
	 */
	function getTitleSnippet( $terms ) {
		return '';
	}

	/**
	 * @param $terms Array: terms to highlight
	 * @return String: highlighted redirect name (redirect to this page), '' if none or not supported
	 */
	function getRedirectSnippet( $terms ) {
		return '';
	}

	/**
	 * @return Title object for the redirect to this page, null if none or not supported
	 */
	function getRedirectTitle() {
		return null;
	}

	/**
	 * @return string highlighted relevant section name, '' if none or not supported
	 */
	function getSectionSnippet() {
		return '';
	}

	/**
	 * @return Title object (pagename+fragment) for the section, null if none or not supported
	 */
	function getSectionTitle() {
		return null;
	}

	/**
	 * Timestamp of the revision if there is one, else of the file,
	 * else ''.
	 *
	 * @return String: timestamp
	 */
	function getTimestamp() {
		if ( $this->mRevision ) {
			return $this->mRevision->getTimestamp();
		}
		if ( $this->mImage ) {
			return $this->mImage->getTimestamp();
		}
		return '';
	}

	/**
	 * @return Integer: number of words, as counted by str_word_count()
	 */
	function getWordCount() {
		$this->initText();
		$words = str_word_count( $this->mText );
		return $words;
	}

	/**
	 * @return Integer: size in bytes
	 */
	function getByteSize() {
		$this->initText();
		$bytes = strlen( $this->mText );
		return $bytes;
	}

	/**
	 * @return Boolean if hit has related articles
	 */
	function hasRelated() {
		return false;
	}

	/**
	 * @return String: interwiki prefix of the title (return iw even if title is broken)
	 */
	function getInterwikiPrefix() {
		return '';
	}
}
/**
 * A SearchResultSet wrapper for SearchEngine::getNearMatch.
 * Holds at most one result: the matched title.
 */
class SearchNearMatchResultSet extends SearchResultSet {
	/** Whether next() has already handed out the single result. */
	private $fetched = false;

	/**
	 * Title if matched, else null.
	 *
	 * Declared explicitly instead of being created dynamically in the
	 * constructor; kept public because that was its effective visibility.
	 */
	public $result;

	/**
	 * @param $match mixed Title if matched, else null
	 */
	public function __construct( $match ) {
		$this->result = $match;
	}

	/**
	 * @return Boolean: true if a title was matched
	 */
	public function hasResult() {
		return (bool)$this->result;
	}

	/**
	 * Override of SearchResultSet::hasResults().
	 *
	 * Without it, hasResults() (also used by numRows() below) resolves to
	 * the parent stub, which always returns false, so a successful near
	 * match was reported as an empty set.
	 *
	 * @return Boolean
	 */
	public function hasResults() {
		return $this->hasResult();
	}

	/**
	 * @return Integer: 1 if a title was matched, else 0
	 */
	public function numRows() {
		return $this->hasResults() ? 1 : 0;
	}

	/**
	 * Hand out the matched title once, wrapped in a SearchResult.
	 *
	 * @return SearchResult|false: false when there is no match or it was
	 *   already fetched
	 */
	public function next() {
		if ( $this->fetched || !$this->result ) {
			return false;
		}
		$this->fetched = true;
		return SearchResult::newFromTitle( $this->result );
	}
}
874
875 /**
876 * Highlight bits of wikitext
877 *
878 * @ingroup Search
879 */
880 class SearchHighlighter {
881 var $mCleanWikitext = true;
882
/**
 * @param $cleanupWikitext Boolean: stored in $mCleanWikitext, which
 *   splitAndAdd() consults to decide whether text goes through
 *   removeWiki() before being split into lines
 */
function __construct( $cleanupWikitext = true ) {
	$this->mCleanWikitext = $cleanupWikitext;
}
886
/**
 * Default implementation of wikitext highlighting.
 *
 * Works in three stages:
 *  1. split the text into plain-text lines and "other" lines (templates,
 *     file links, tables and, when wfCite() exists, <ref> blocks);
 *  2. pick lines that match the whole phrase or any single term and
 *     extend them to a roughly constant total size;
 *  3. join the pieces with '<b> ... </b>' separators and wrap every term
 *     occurrence in <span class='searchmatch'>.
 *
 * The terms are inserted into regular expressions as given, i.e. they are
 * treated as regex fragments and are not quoted here. The surrounding text
 * is not HTML-escaped by this method either.
 *
 * @param $text String
 * @param $terms Array: terms to highlight (unescaped)
 * @param $contextlines Integer
 * @param $contextchars Integer
 * @return String
 */
public function highlightText( $text, $terms, $contextlines, $contextchars ) {
	global $wgContLang;
	global $wgSearchHighlightBoundaries;
	$fname = __METHOD__;

	if ( $text == '' ) {
		return '';
	}

	// split text into text + templates/links/tables
	$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
	// first capture group is for detecting nested templates/links/tables/references
	$endPatterns = array(
		1 => '/(\{\{)|(\}\})/', // template
		2 => '/(\[\[)|(\]\])/', // image
		3 => "/(\n\\{\\|)|(\n\\|\\})/" ); // table

	// FIXME: this should prolly be a hook or something
	if ( function_exists( 'wfCite' ) ) {
		$spat .= '|(<ref>)'; // references via cite extension
		$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
	}
	$spat .= '/';
	$textExt = array(); // text extracts
	$otherExt = array(); // other extracts
	wfProfileIn( "$fname-split" );
	$start = 0;
	$textLen = strlen( $text );
	$count = 0; // sequence number to maintain ordering
	while ( $start < $textLen ) {
		// find start of template/image/table
		if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
			$epat = '';
			foreach ( $matches as $key => $val ) {
				// the first capture group that took part in the match
				// tells which kind of element starts here
				if ( $key > 0 && $val[1] != - 1 ) {
					if ( $key == 2 ) {
						// see if this is an image link
						$ns = substr( $val[0], 2, - 1 );
						if ( $wgContLang->getNsIndex( $ns ) != NS_FILE ) {
							break;
						}
					}
					$epat = $endPatterns[$key];
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
					$start = $val[1];
					break;
				}
			}
			if ( $epat ) {
				// find end (and detect any nested elements)
				$level = 0;
				$offset = $start + 1;
				$found = false;
				while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
					if ( array_key_exists( 2, $endMatches ) ) {
						// found end
						if ( $level == 0 ) {
							$len = strlen( $endMatches[2][0] );
							$off = $endMatches[2][1];
							$this->splitAndAdd( $otherExt, $count,
								substr( $text, $start, $off + $len - $start ) );
							$start = $off + $len;
							$found = true;
							break;
						} else {
							// end of nested element
							$level -= 1;
						}
					} else {
						// nested
						$level += 1;
					}
					$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
				}
				if ( ! $found ) {
					// couldn't find appropriate closing tag, skip
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
					$start += strlen( $matches[0][0] );
				}
				continue;
			}
		}
		// else: add as text extract
		$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
		break;
	}

	$all = $textExt + $otherExt; // these have disjunct key sets

	wfProfileOut( "$fname-split" );

	// prepare regexps
	foreach ( $terms as $index => $term ) {
		// manually do upper/lowercase stuff for utf-8 since PHP won't do it
		if ( preg_match( '/[\x80-\xff]/', $term ) ) {
			$terms[$index] = preg_replace_callback( '/./us', array( $this, 'caseCallback' ), $terms[$index] );
		}
	}
	$anyterm = implode( '|', $terms );
	$phrase = implode( "$wgSearchHighlightBoundaries+", $terms );

	// FIXME: a hack to scale contextchars, a correct solution
	// would be to have contextchars actually be char and not byte
	// length, and do proper utf-8 substrings and lengths everywhere,
	// but PHP is making that very hard and unclean to implement :(
	//
	// With no terms (or only empty ones) $anyterm is '' and its character
	// count is 0; leave $contextchars unscaled instead of dividing by zero.
	$anytermChars = mb_strlen( $anyterm );
	$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
	$contextchars = intval( $contextchars * $scale );

	$patPre = "(^|$wgSearchHighlightBoundaries)";
	$patPost = "($wgSearchHighlightBoundaries|$)";

	$pat1 = "/(" . $phrase . ")/ui";
	$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

	wfProfileIn( "$fname-extract" );

	$left = $contextlines;

	$snippets = array();
	$offsets = array();

	// show beginning only if it contains all words
	$first = 0;
	$firstText = '';
	foreach ( $textExt as $index => $line ) {
		if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
			$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
			$first = $index;
			break;
		}
	}
	if ( $firstText ) {
		$succ = true;
		// check if first text contains all terms
		foreach ( $terms as $term ) {
			if ( ! preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
				$succ = false;
				break;
			}
		}
		if ( $succ ) {
			$snippets[$first] = $firstText;
			$offsets[$first] = 0;
		}
	}
	if ( ! $snippets ) {
		// match whole query on text
		$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
		// match whole query on templates/tables/images
		$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
		// match any words on text
		$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
		// match any words on templates/tables/images
		$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

		ksort( $snippets );
	}

	// add extra chars to each snippet to make snippets constant size
	$extended = array();
	// Only read while iterating $snippets below; initialised so it is
	// defined on every path, including the one that leaves $snippets empty.
	$targetchars = 0;
	if ( count( $snippets ) == 0 ) {
		// couldn't find the target words, just show beginning of article
		if ( array_key_exists( $first, $all ) ) {
			$targetchars = $contextchars * $contextlines;
			$snippets[$first] = '';
			$offsets[$first] = 0;
		}
	} else {
		// if begin of the article contains the whole phrase, show only that !!
		if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
			&& $offsets[$first] < $contextchars * 2 ) {
			$snippets = array ( $first => $snippets[$first] );
		}

		// calc by how much to extend existing snippets
		$targetchars = intval( ( $contextchars * $contextlines ) / count ( $snippets ) );
	}

	foreach ( $snippets as $index => $line ) {
		$extended[$index] = $line;
		$len = strlen( $line );
		if ( $len < $targetchars - 20 ) {
			// complete this line
			if ( $len < strlen( $all[$index] ) ) {
				$extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index] + $targetchars, $offsets[$index] );
				$len = strlen( $extended[$index] );
			}

			// add more lines
			$add = $index + 1;
			while ( $len < $targetchars - 20
					&& array_key_exists( $add, $all )
					&& !array_key_exists( $add, $snippets ) ) {
				$offsets[$add] = 0;
				$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
				$extended[$add] = $tt;
				$len += strlen( $tt );
				$add++;
			}
		}
	}

	// $snippets = array_map('htmlspecialchars', $extended);
	$snippets = $extended;
	$last = - 1;
	$extract = '';
	foreach ( $snippets as $index => $line ) {
		if ( $last == - 1 ) {
			$extract .= $line; // first line
		} elseif ( $last + 1 == $index && $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] ) ) {
			$extract .= " " . $line; // continuous lines
		} else {
			$extract .= '<b> ... </b>' . $line;
		}

		$last = $index;
	}
	if ( $extract ) {
		$extract .= '<b> ... </b>';
	}

	$processed = array();
	foreach ( $terms as $term ) {
		if ( ! isset( $processed[$term] ) ) {
			$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
			$extract = preg_replace( $pat3,
				"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
			$processed[$term] = true;
		}
	}

	wfProfileOut( "$fname-extract" );

	return $extract;
}
1130
/**
 * Split text into lines and add it to extracts array.
 *
 * When $mCleanWikitext is set the text first goes through removeWiki().
 * Each line is trimmed; lines that are empty after trimming are skipped.
 * A line consisting only of "0" is kept: the emptiness test compares
 * against '' rather than relying on PHP truthiness.
 *
 * @param $extracts Array: index -> $line
 * @param $count Integer: next free index; incremented for every line added
 * @param $text String
 */
function splitAndAdd( &$extracts, &$count, $text ) {
	$split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
	foreach ( $split as $line ) {
		$tt = trim( $line );
		if ( $tt !== '' ) {
			$extracts[$count++] = $tt;
		}
	}
}
1146
/**
 * Do manual case conversion for non-ascii chars.
 *
 * Used as a preg_replace_callback() handler: a match longer than one
 * byte is replaced by a bracket expression holding its lower- and
 * upper-case forms; anything else is returned untouched.
 *
 * @param $matches Array: $matches[0] is the matched character
 * @return String
 */
function caseCallback( $matches ) {
	global $wgContLang;

	$char = $matches[0];
	if ( strlen( $char ) <= 1 ) {
		return $char;
	}

	return '[' . $wgContLang->lc( $char ) . $wgContLang->uc( $char ) . ']';
}
1159
/**
 * Extract part of the text from start to end, but by
 * not chopping up words.
 *
 * A non-zero $start and an $end that lies inside the text are each moved
 * with position(); an $end at or beyond the text length is clamped to it.
 * The out parameters are only written when the caller passed a non-null
 * value in them.
 *
 * @param $text String
 * @param $start Integer
 * @param $end Integer
 * @param $posStart Integer: (out) actual start position
 * @param $posEnd Integer: (out) actual end position
 * @return String: the extracted part, '' if the adjusted range is empty
 */
function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
	if ( $start != 0 ) {
		$start = $this->position( $text, $start, 1 );
	}

	$textLen = strlen( $text );
	$end = ( $end >= $textLen ) ? $textLen : $this->position( $text, $end );

	if ( !is_null( $posStart ) ) {
		$posStart = $start;
	}
	if ( !is_null( $posEnd ) ) {
		$posEnd = $end;
	}

	return ( $end > $start ) ? substr( $text, $start, $end - $start ) : '';
}
1188
/**
 * Find a nonletter near a point (byte index) in the text.
 *
 * A window reaching 10 bytes to either side of $point is scanned for a
 * space or one of a fixed set of ASCII punctuation characters. The FIRST such
 * character in the window wins, which is not necessarily the one closest
 * to $point. When the window holds none, $point is moved forward past any
 * UTF-8 continuation bytes (0x80-0xBF) so that the result never lands in
 * the middle of a multi-byte character.
 *
 * @param $text String
 * @param $point Integer: byte index to search around
 * @param $offset Integer: added to the index of the nonletter when one is
 *   found (not applied in the fallback case)
 * @return Integer: index of the nonletter plus $offset; otherwise the
 *   beginning of the utf8 char at or after $point, capped at strlen( $text )
 */
function position( $text, $point, $offset = 0 ) {
	$tolerance = 10;
	$length = strlen( $text );
	$s = max( 0, $point - $tolerance );
	$l = min( $length, $point + $tolerance ) - $s;
	$m = array();
	if ( preg_match( '/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr( $text, $s, $l ), $m, PREG_OFFSET_CAPTURE ) ) {
		// match offset is relative to the window, so shift it back by $s
		return $m[0][1] + $s + $offset;
	}

	// No nonletter nearby. Check the bound before every read so that an
	// empty text or a point at/after the end never indexes past the string.
	while ( $point < $length ) {
		$byte = ord( $text[$point] );
		if ( $byte < 0x80 || $byte >= 0xc0 ) {
			// ASCII byte or UTF-8 lead byte: a valid place to cut
			return $point;
		}
		// continuation byte, keep skipping
		$point++;
	}
	return $length;
}
1218
/**
 * Search extracts for a pattern, and collect a snippet from every
 * matching line until the quota in $linesleft is used up.
 *
 * Lines whose index already exists in $out are skipped. For a matching
 * line the snippet window of $contextchars bytes starts at 0 when the
 * match ends inside the first $contextchars bytes, starts at the match
 * when the match is longer than the window, and is otherwise centred on
 * the match. The window is then handed to extract().
 *
 * @param $pattern String: regexp for matching lines
 * @param $extracts Array: extracts to search (index -> line)
 * @param $linesleft Integer: (in/out) number of extracts still to make;
 *   decremented for every snippet produced
 * @param $contextchars Integer: length of snippet
 * @param $out Array: (in/out) map of index -> snippet
 * @param $offsets Array: (in/out) map of index -> starting point of snippet
 * @protected
 */
function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
	if ( $linesleft == 0 ) {
		return; // quota already used up
	}

	foreach ( $extracts as $index => $line ) {
		$m = array();
		// skip lines that already have a snippet and lines without a match
		if ( array_key_exists( $index, $out )
			|| !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE )
		) {
			continue;
		}

		$matchStart = $m[0][1];
		$matchLen = strlen( $m[0][0] );
		if ( $matchStart + $matchLen < $contextchars ) {
			$begin = 0;
		} elseif ( $matchLen > $contextchars ) {
			$begin = $matchStart;
		} else {
			$begin = $matchStart + intval( ( $matchLen - $contextchars ) / 2 );
		}

		// extract() only fills its out-parameter when it is non-null,
		// so seed it with the requested start first
		$posBegin = $begin;
		$out[$index] = $this->extract( $line, $begin, $begin + $contextchars, $posBegin );
		$offsets[$index] = $posBegin;

		if ( --$linesleft == 0 ) {
			return;
		}
	}
}
1261
/**
 * Basic wikitext removal.
 *
 * Applies a fixed sequence of regex substitutions, in this order:
 * templates without a pipe are dropped, templates with a pipe are reduced
 * to what follows the first pipe, unpiped [[links]] are reduced to their
 * content, piped links go through linkReplace(), then tags and the
 * quote-based bold/italic markers are removed.
 *
 * @param $text String
 * @return String
 * @protected
 */
function removeWiki( $text ) {
	$fname = __METHOD__;
	wfProfileIn( $fname );

	// pattern => replacement pairs run before the piped-link callback
	$beforeLinks = array(
		array( "/\\{\\{([^|]+?)\\}\\}/", "" ),
		array( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2" ),
		array( "/\\[\\[([^|]+?)\\]\\]/", "\\1" ),
	);
	foreach ( $beforeLinks as $step ) {
		$text = preg_replace( $step[0], $step[1], $text );
	}

	$text = preg_replace_callback( "/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array( $this, 'linkReplace' ), $text );

	// patterns that are simply deleted, run after the piped-link callback
	$stripped = array(
		"/<\/?[^>]+>/",
		"/'''''/",
		"/('''|<\/?[iIuUbB]>)/",
		"/''/",
	);
	foreach ( $stripped as $pattern ) {
		$text = preg_replace( $pattern, "", $text );
	}

	wfProfileOut( $fname );
	return $text;
}
1289
/**
 * Callback to replace [[target|caption]] kind of links with their
 * caption. If the target's namespace prefix resolves to the file or
 * category namespace, the link is left as it is.
 *
 * @param $matches Array: 0 => whole link, 1 => target part, 2 => caption
 * @return String
 */
function linkReplace( $matches ) {
	global $wgContLang;

	$target = $matches[1];
	$colonPos = strpos( $target, ':' );
	if ( $colonPos !== false ) {
		// text before the first colon is the candidate namespace name
		$nsIndex = $wgContLang->getNsIndex( substr( $target, 0, $colonPos ) );
		if ( $nsIndex !== false && ( $nsIndex == NS_FILE || $nsIndex == NS_CATEGORY ) ) {
			return $matches[0]; // return the whole thing
		}
	}

	// no prefix, or some other namespace: replace with caption
	return $matches[2];
}
1309
/**
 * Simple & fast snippet extraction, but gives completely irrelevant
 * snippets.
 *
 * Walks the text line by line and, for at most $contextlines matching
 * lines, emits the match with up to $contextchars of context on either
 * side (shortened through the content language's truncate()). Each
 * emitted line is HTML-escaped, has the terms wrapped in
 * <span class='searchmatch'>, and is terminated by "\n".
 *
 * @param $text String
 * @param $terms Array: regex fragments; they are joined with '|' and put
 *   into the patterns without quoting
 * @param $contextlines Integer: maximum number of lines to emit
 * @param $contextchars Integer: context to keep on each side of the match
 * @return String
 */
public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
	global $wgContLang;
	$fname = __METHOD__;

	$lines = explode( "\n", $text );

	$terms = implode( '|', $terms );
	$max = intval( $contextchars ) + 1;
	// greedy prefix, the match itself, then at most $max chars after it
	$pat1 = "/(.*)($terms)(.{0,$max})/i";
	// highlight pattern is the same for every line, so build it once
	$pat2 = '/(' . $terms . ")/i";

	$extract = "";
	wfProfileIn( "$fname-extract" );
	foreach ( $lines as $line ) {
		if ( 0 == $contextlines ) {
			break;
		}
		$m = array();
		if ( !preg_match( $pat1, $line, $m ) ) {
			continue;
		}
		--$contextlines;

		// truncate function changes ... to relevant i18n message.
		$pre = $wgContLang->truncate( $m[1], - $contextchars, '...', false );
		// guard the trailing group directly instead of counting groups
		$post = isset( $m[3] )
			? $wgContLang->truncate( $m[3], $contextchars, '...', false )
			: '';
		$found = $m[2];

		// NOTE(review): highlighting runs on the already-escaped string, so
		// a term such as "amp" can also match inside an entity produced by
		// htmlspecialchars(). Left unchanged here; confirm whether callers
		// can pass such terms.
		$line = htmlspecialchars( $pre . $found . $post );
		$line = preg_replace( $pat2,
			"<span class='searchmatch'>\\1</span>", $line );

		// plain concatenation: "${var}" interpolation is deprecated in PHP 8.2
		$extract .= $line . "\n";
	}
	wfProfileOut( "$fname-extract" );

	return $extract;
}
1366
1367 }
1368
/**
 * Placeholder search engine for use when the database engine in use is
 * not supported. It adds nothing of its own and therefore behaves exactly
 * like the SearchEngine base class.
 *
 * @todo Fixme: dummy class should probably try something at least mildly useful,
 * such as a LIKE search through titles.
 * @ingroup Search
 */
class SearchEngineDummy extends SearchEngine {
}