Bug 19996 (backend hooks) Finally adding the four hooks.
[lhc/web/wiklou.git] / includes / search / SearchEngine.php
1 <?php
2 /**
3 * @defgroup Search Search
4 *
5 * @file
6 * @ingroup Search
7 */
8
/**
 * Base class for search backends.
 *
 * Carries the per-query settings (limit, offset, namespaces, ...), provides
 * stub implementations of the search and index-update entry points, and
 * bundles static helpers for near-match lookup, namespace selection and
 * suggestion URL templates.
 *
 * @ingroup Search
 */
class SearchEngine {
	public $limit = 10;
	public $offset = 0;
	public $prefix = '';
	public $searchTerms = array();
	public $namespaces = array( NS_MAIN );
	public $showRedirects = false;

	/**
	 * Run a full text search and hand back a result set.
	 * Backends that do not support (or have disabled) this return null.
	 * STUB: the base implementation always returns null.
	 *
	 * @param $term String: raw search term
	 * @return SearchResultSet
	 */
	function searchText( $term ) {
		return null;
	}

	/**
	 * Run a title-only search and hand back a result set.
	 * Backends that do not support (or have disabled) this return null.
	 * STUB: the base implementation always returns null.
	 *
	 * @param $term String: raw search term
	 * @return SearchResultSet
	 */
	function searchTitle( $term ) {
		return null;
	}

	/**
	 * Whether this backend is able to list/unlist redirects.
	 *
	 * @return Boolean: always true in the base class
	 */
	function acceptListRedirects() {
		return true;
	}

	/**
	 * Rewrite the search term when parts of the query arrived as separate
	 * GET parameters (where the backend supports that), e.g. for prefix
	 * queries: search=test&prefix=Main_Page/Archive -> test prefix:Main Page/Archive
	 *
	 * The base implementation hands the term back untouched.
	 *
	 * @param $term String
	 * @return String
	 */
	function transformSearchTerm( $term ) {
		return $term;
	}

	/**
	 * Look for an exact or very close title match for the search term and
	 * let the 'SearchGetNearMatchComplete' hook adjust the outcome.
	 *
	 * @param $searchterm String
	 * @return Title, or null when nothing matched
	 */
	public static function getNearMatch( $searchterm ) {
		$match = self::getNearMatchInternal( $searchterm );

		wfRunHooks( 'SearchGetNearMatchComplete', array( $searchterm, &$match ) );
		return $match;
	}

	/**
	 * Does the actual work for getNearMatch().
	 *
	 * @param $searchterm String
	 * @return Title, or null when nothing matched
	 */
	private static function getNearMatchInternal( $searchterm ) {
		global $wgContLang;

		// The term itself, followed by its language variants (if any)
		$candidates = array( $searchterm );
		if ( $wgContLang->hasVariants() ) {
			$candidates = array_merge(
				$candidates,
				$wgContLang->convertLinkToAllVariants( $searchterm )
			);
		}

		// A hook may short-circuit the whole lookup and supply its own result
		if ( !wfRunHooks( 'SearchGetNearMatchBefore', array( $candidates, &$titleResult ) ) ) {
			return $titleResult;
		}

		// Case variants tried, in this order, when the literal term misses:
		// all lower case (i.e. first letter capitalized), capitalized words,
		// all upper case, Word-Caps-Breaking-At-Word-Breaks (hyphenated names etc)
		$caseMethods = array( 'lc', 'ucwords', 'uc', 'ucwordbreaks' );

		foreach ( $candidates as $term ) {
			// Exact match? No need to look further.
			$title = Title::newFromText( $term );
			if ( is_null( $title ) ) {
				return null;
			}

			if ( $title->getNamespace() == NS_SPECIAL || $title->isExternal() || $title->exists() ) {
				return $title;
			}

			// See if it still otherwise has content in some sane sense
			if ( MediaWiki::articleFromTitle( $title )->hasViewableContent() ) {
				return $title;
			}

			foreach ( $caseMethods as $method ) {
				$title = Title::newFromText( $wgContLang->$method( $term ) );
				if ( $title && $title->exists() ) {
					return $title;
				}
			}

			// Give hooks a chance at better match variants
			$title = null;
			if ( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) {
				return $title;
			}
		}

		$title = Title::newFromText( $searchterm );
		$ns = $title->getNamespace();

		// Entering an IP address goes to the contributions page
		if ( ( $ns == NS_USER && User::isIP( $title->getText() ) )
			|| User::isIP( trim( $searchterm ) )
		) {
			return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
		}

		// Entering a user goes to the user page whether it's there or not
		if ( $ns == NS_USER ) {
			return $title;
		}

		// Go to images that exist even if there's no local page.
		// There may have been a funny upload, or it may be on a shared
		// file repository such as Wikimedia Commons.
		if ( $ns == NS_FILE && wfFindFile( $title ) ) {
			return $title;
		}

		// MediaWiki namespace? Page may be "implied" if not customized.
		// Just return it, with caps forced as the message system likes it.
		if ( $ns == NS_MEDIAWIKI ) {
			return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
		}

		// Quoted term? Try again without the quotes...
		$quoted = array();
		if ( preg_match( '/^"([^"]+)"$/', $searchterm, $quoted ) ) {
			return self::getNearMatch( $quoted[1] );
		}

		return null;
	}

	/**
	 * Character-class body (for use inside a regex [...]) listing the
	 * characters filter() keeps.
	 *
	 * @return String
	 */
	public static function legalSearchChars() {
		return "A-Za-z_'.0-9\\x80-\\xFF\\-";
	}

	/**
	 * Set the maximum number of results to return
	 * and how many to skip before returning the first.
	 * Both values are forced to integers.
	 *
	 * @param $limit Integer
	 * @param $offset Integer
	 */
	function setLimitOffset( $limit, $offset = 0 ) {
		$this->limit = intval( $limit );
		$this->offset = intval( $offset );
	}

	/**
	 * Set which namespaces the search should include.
	 * Give an array of namespace index numbers.
	 *
	 * @param $namespaces Array
	 */
	function setNamespaces( $namespaces ) {
		$this->namespaces = $namespaces;
	}

	/**
	 * Parse some common prefixes off the query: the "all" keyword (search
	 * everything, sets namespaces to null) or a namespace name (restricts
	 * the search to that namespace). The 'SearchEngineReplacePrefixesComplete'
	 * hook always runs before returning.
	 *
	 * @param $query String
	 * @return String: the query with a recognised prefix removed
	 */
	function replacePrefixes( $query ) {
		global $wgContLang;

		$parsed = $query;
		$colonPos = strpos( $query, ':' );

		if ( $colonPos !== false ) {
			$allkeyword = wfMsgForContent( 'searchall' ) . ":";
			if ( strncmp( $query, $allkeyword, strlen( $allkeyword ) ) == 0 ) {
				$this->namespaces = null;
				$parsed = substr( $query, strlen( $allkeyword ) );
			} else {
				$prefix = substr( $query, 0, $colonPos );
				$index = $wgContLang->getNsIndex( $prefix );
				if ( $index !== false ) {
					$this->namespaces = array( $index );
					$parsed = substr( $query, strlen( $prefix ) + 1 );
				}
			}

			if ( trim( $parsed ) == '' ) {
				// prefix was the whole query
				$parsed = $query;
			}
		}

		wfRunHooks( 'SearchEngineReplacePrefixesComplete', array( $this, $query, &$parsed ) );

		return $parsed;
	}

	/**
	 * Make a list of searchable namespaces (index >= NS_MAIN) and their
	 * names; the 'SearchableNamespaces' hook may alter the list.
	 *
	 * @return Array: namespace index => name
	 */
	public static function searchableNamespaces() {
		global $wgContLang;

		$searchable = array();
		foreach ( $wgContLang->getNamespaces() as $ns => $name ) {
			if ( $ns < NS_MAIN ) {
				continue;
			}
			$searchable[$ns] = $name;
		}

		wfRunHooks( 'SearchableNamespaces', array( &$searchable ) );
		return $searchable;
	}

	/**
	 * Extract default namespaces to search from the given user's
	 * settings, returning a list of index numbers.
	 *
	 * @param $user User
	 * @return Array
	 */
	public static function userNamespaces( $user ) {
		global $wgSearchEverythingOnlyLoggedIn;

		// The "search everything" preference may be restricted to logged-in users
		$searcheverything = false;
		if ( !$wgSearchEverythingOnlyLoggedIn || $user->isLoggedIn() ) {
			$searcheverything = $user->getOption( 'searcheverything' );
		}

		// searcheverything overrides other options
		if ( $searcheverything ) {
			return array_keys( self::searchableNamespaces() );
		}

		$preferred = Preferences::loadOldSearchNs( $user );
		$searchable = self::searchableNamespaces();

		// Keep only namespaces that are actually searchable
		return array_intersect( $preferred, array_keys( $searchable ) );
	}

	/**
	 * Find snippet highlight settings for a given user.
	 * The values are hardcoded; the user's preferences are not consulted.
	 *
	 * @param $user User
	 * @return Array contextlines, contextchars
	 */
	public static function userHighlightPrefs( &$user ) {
		return array( 2, 75 );
	}

	/**
	 * An array of namespaces indexes to be searched by default
	 *
	 * @return Array
	 */
	public static function defaultNamespaces() {
		global $wgNamespacesToBeSearchedDefault;

		return array_keys( $wgNamespacesToBeSearchedDefault, true );
	}

	/**
	 * Get a list of namespace names useful for showing in tooltips
	 * and preferences. Empty names are replaced by the 'blanknamespace'
	 * message.
	 *
	 * @param $namespaces Array
	 * @return Array
	 */
	public static function namespacesAsText( $namespaces ) {
		global $wgContLang;

		$formatted = array_map( array( $wgContLang, 'getFormattedNsText' ), $namespaces );
		foreach ( array_keys( $formatted ) as $key ) {
			if ( empty( $formatted[$key] ) ) {
				$formatted[$key] = wfMsg( 'blanknamespace' );
			}
		}
		return $formatted;
	}

	/**
	 * Return the help namespaces to be shown on Special:Search
	 *
	 * @return Array
	 */
	public static function helpNamespaces() {
		global $wgNamespacesToBeSearchedHelp;

		return array_keys( $wgNamespacesToBeSearchedHelp, true );
	}

	/**
	 * Return a 'cleaned up' search string: every character outside
	 * legalSearchChars() becomes a space, then the result is trimmed.
	 *
	 * @param $text String
	 * @return String
	 */
	function filter( $text ) {
		$legal = $this->legalSearchChars();
		$cleaned = preg_replace( "/[^{$legal}]/", " ", $text );
		return trim( $cleaned );
	}

	/**
	 * Load up the appropriate search engine class for the currently
	 * active database backend, and return a configured instance.
	 * $wgSearchType, when set, overrides the database's own choice.
	 *
	 * @return SearchEngine
	 */
	public static function create() {
		global $wgSearchType;

		$dbr = wfGetDB( DB_SLAVE );
		$class = $wgSearchType ? $wgSearchType : $dbr->getSearchEngine();

		$search = new $class( $dbr );
		$search->setLimitOffset( 0, 0 );
		return $search;
	}

	/**
	 * Create or update the search index record for the given page.
	 * Title and text should be pre-processed.
	 * STUB
	 *
	 * @param $id Integer
	 * @param $title String
	 * @param $text String
	 */
	function update( $id, $title, $text ) {
		// no-op
	}

	/**
	 * Update a search index record's title only.
	 * Title should be pre-processed.
	 * STUB
	 *
	 * @param $id Integer
	 * @param $title String
	 */
	function updateTitle( $id, $title ) {
		// no-op
	}

	/**
	 * Get OpenSearch suggestion template: $wgOpenSearchTemplate when set,
	 * otherwise an api.php URL restricted to the default namespaces.
	 *
	 * @return String
	 */
	public static function getOpenSearchTemplate() {
		global $wgOpenSearchTemplate, $wgServer, $wgScriptPath;

		if ( $wgOpenSearchTemplate ) {
			return $wgOpenSearchTemplate;
		}

		$ns = implode( '|', self::defaultNamespaces() );
		if ( !$ns ) {
			$ns = "0";
		}
		return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace=' . $ns;
	}

	/**
	 * Get internal MediaWiki Suggest template: $wgMWSuggestTemplate when
	 * set, otherwise an api.php URL with a {namespaces} placeholder.
	 *
	 * @return String
	 */
	public static function getMWSuggestTemplate() {
		global $wgMWSuggestTemplate, $wgServer, $wgScriptPath;

		if ( $wgMWSuggestTemplate ) {
			return $wgMWSuggestTemplate;
		}
		return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace={namespaces}&suggest';
	}
}
420
/**
 * Base result set. Every method here is a default that reports "nothing
 * found"; backend-specific result sets override what they support.
 *
 * @ingroup Search
 */
class SearchResultSet {
	/**
	 * Fetch an array of regular expression fragments for matching
	 * the search terms as parsed by this engine in a text extract.
	 * STUB
	 *
	 * @return Array
	 */
	public function termMatches() {
		return array();
	}

	/**
	 * Number of rows in this result set.
	 *
	 * @return Integer: always 0 in the base class
	 */
	public function numRows() {
		return 0;
	}

	/**
	 * Return true if results are included in this result set.
	 * STUB
	 *
	 * @return Boolean
	 */
	public function hasResults() {
		return false;
	}

	/**
	 * Some search modes return a total hit count for the query
	 * in the entire article database. This may include pages
	 * in namespaces that would not be matched on the given
	 * settings.
	 *
	 * Return null if no total hits number is supported.
	 *
	 * @return Integer
	 */
	public function getTotalHits() {
		return null;
	}

	/**
	 * Some search modes return a suggested alternate term if there are
	 * no exact hits. Returns true if there is one on this set.
	 *
	 * @return Boolean
	 */
	public function hasSuggestion() {
		return false;
	}

	/**
	 * @return String: suggested query, null if none
	 */
	public function getSuggestionQuery() {
		return null;
	}

	/**
	 * @return String: HTML highlighted suggested query, '' if none
	 */
	public function getSuggestionSnippet() {
		return '';
	}

	/**
	 * Return information about how and from where the results were fetched,
	 * should be useful for diagnostics and debugging
	 *
	 * @return String: null in the base class
	 */
	public function getInfo() {
		return null;
	}

	/**
	 * Return a result set of hits on other (multiple) wikis associated with this one
	 *
	 * @return SearchResultSet: null in the base class
	 */
	public function getInterwikiResults() {
		return null;
	}

	/**
	 * Check if there are results on other wikis, i.e. whether
	 * getInterwikiResults() yields something other than null.
	 *
	 * @return Boolean
	 */
	public function hasInterwikiResults() {
		$interwiki = $this->getInterwikiResults();
		return $interwiki != null;
	}

	/**
	 * Fetches next search result, or false.
	 * STUB
	 *
	 * @return SearchResult
	 */
	public function next() {
		return false;
	}

	/**
	 * Frees the result set, if applicable.
	 */
	public function free() {
		// nothing to release in the base class
	}
}
534
/**
 * This class is used for different SQL-based search engines shipped with MediaWiki
 *
 * Wraps a database result object together with the list of term-matching
 * regex fragments. The wrapped result may be boolean false (next() already
 * treated that case as "no rows"); numRows() and free() now tolerate it as
 * well instead of calling a method on a non-object.
 */
class SqlSearchResultSet extends SearchResultSet {
	/** Database result object, or false when there is none */
	public $mResultSet;
	/** Array of regular expression fragments returned by termMatches() */
	public $mTerms;

	/**
	 * @param $resultSet Database result object, or false
	 * @param $terms Array: regular expression fragments for the search terms
	 */
	function __construct( $resultSet, $terms ) {
		$this->mResultSet = $resultSet;
		$this->mTerms = $terms;
	}

	/**
	 * @return Array: the terms handed to the constructor
	 */
	function termMatches() {
		return $this->mTerms;
	}

	/**
	 * @return Integer: row count of the wrapped result, 0 when there is no result
	 */
	function numRows() {
		if ( $this->mResultSet === false ) {
			return 0;
		}
		return $this->mResultSet->numRows();
	}

	/**
	 * Fetch the next row wrapped in a SearchResult.
	 *
	 * @return SearchResult, or false when there is no result or no more rows
	 */
	function next() {
		if ( $this->mResultSet === false ) {
			return false;
		}

		$row = $this->mResultSet->fetchObject();
		if ( $row === false ) {
			return false;
		}
		return new SearchResult( $row );
	}

	/**
	 * Release the wrapped result; does nothing when there is no result.
	 */
	function free() {
		if ( $this->mResultSet === false ) {
			return;
		}
		$this->mResultSet->free();
	}
}
566
/**
 * Empty class without members.
 * Some search engines may bail out if too many matches are found.
 *
 * @ingroup Search
 */
class SearchResultTooMany {
}
573
574
/**
 * A single search hit: a title plus, where available, its current revision
 * and (for the file namespace) the file object.
 *
 * @todo Fixme: This class is horribly factored. It would probably be better to
 * have a useful base class to which you pass some standard information, then
 * let the fancy self-highlighters extend that.
 * @ingroup Search
 */
class SearchResult {
	/** Revision of the title as loaded by Revision::newFromTitle(), if any */
	public $mRevision = null;
	/** File object from wfFindFile() for titles in NS_FILE, if any */
	public $mImage = null;
	/** Title built from the result row */
	public $mTitle = null;
	/** Lazily loaded revision text; stays null until initText() has run */
	public $mText = null;

	/**
	 * @param $row Object: result row providing page_namespace and page_title
	 */
	function __construct( $row ) {
		$this->mTitle = Title::makeTitle( $row->page_namespace, $row->page_title );
		if ( !is_null( $this->mTitle ) ) {
			$this->mRevision = Revision::newFromTitle( $this->mTitle );
			if ( $this->mTitle->getNamespace() === NS_FILE ) {
				$this->mImage = wfFindFile( $this->mTitle );
			}
		}
	}

	/**
	 * Check if this result points to an invalid title
	 *
	 * @return Boolean
	 */
	function isBrokenTitle() {
		return is_null( $this->mTitle );
	}

	/**
	 * Check if target page is missing, happens when index is out of date.
	 * A hit counts as missing when it has neither a revision nor a file.
	 *
	 * @return Boolean
	 */
	function isMissingRevision() {
		return !$this->mRevision && !$this->mImage;
	}

	/**
	 * @return Title
	 */
	function getTitle() {
		return $this->mTitle;
	}

	/**
	 * @return Double or null if not supported
	 */
	function getScore() {
		return null;
	}

	/**
	 * Lazy initialization of article text from DB.
	 * Without a revision the text is the empty string.
	 */
	protected function initText() {
		if ( !isset( $this->mText ) ) {
			if ( $this->mRevision != null ) {
				$this->mText = $this->mRevision->getText();
			} else {
				// TODO: can we fetch raw wikitext for commons images?
				$this->mText = '';
			}
		}
	}

	/**
	 * Build a highlighted snippet of the page text; the highlighter method
	 * used depends on $wgAdvancedSearchHighlighting.
	 *
	 * @param $terms Array: terms to highlight
	 * @return String: highlighted text snippet, null (and not '') if not supported
	 */
	function getTextSnippet( $terms ) {
		global $wgUser, $wgAdvancedSearchHighlighting;

		$this->initText();
		list( $contextlines, $contextchars ) = SearchEngine::userHighlightPrefs( $wgUser );
		$h = new SearchHighlighter();
		if ( $wgAdvancedSearchHighlighting ) {
			return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars );
		} else {
			return $h->highlightSimple( $this->mText, $terms, $contextlines, $contextchars );
		}
	}

	/**
	 * @param $terms Array: terms to highlight
	 * @return String: highlighted title, '' if not supported
	 */
	function getTitleSnippet( $terms ) {
		return '';
	}

	/**
	 * @param $terms Array: terms to highlight
	 * @return String: highlighted redirect name (redirect to this page), '' if none or not supported
	 */
	function getRedirectSnippet( $terms ) {
		return '';
	}

	/**
	 * @return Title object for the redirect to this page, null if none or not supported
	 */
	function getRedirectTitle() {
		return null;
	}

	/**
	 * @return string highlighted relevant section name, '' if none or not supported
	 */
	function getSectionSnippet() {
		return '';
	}

	/**
	 * @return Title object (pagename+fragment) for the section, null if none or not supported
	 */
	function getSectionTitle() {
		return null;
	}

	/**
	 * Timestamp of the revision, else of the file, else ''.
	 *
	 * @return String: timestamp
	 */
	function getTimestamp() {
		if ( $this->mRevision ) {
			return $this->mRevision->getTimestamp();
		} elseif ( $this->mImage ) {
			return $this->mImage->getTimestamp();
		}
		return '';
	}

	/**
	 * @return Integer: number of words, as counted by str_word_count()
	 */
	function getWordCount() {
		$this->initText();
		return str_word_count( $this->mText );
	}

	/**
	 * @return Integer: size in bytes
	 */
	function getByteSize() {
		$this->initText();
		return strlen( $this->mText );
	}

	/**
	 * @return Boolean if hit has related articles
	 */
	function hasRelated() {
		return false;
	}

	/**
	 * @return String: interwiki prefix of the title (return iw even if title is broken)
	 */
	function getInterwikiPrefix() {
		return '';
	}
}
734
/**
 * Highlight bits of wikitext
 *
 * @ingroup Search
 */
class SearchHighlighter {
	/** Whether splitAndAdd() runs removeWiki() on text before splitting it */
	var $mCleanWikitext = true;

	/**
	 * @param $cleanupWikitext Boolean: strip basic wiki markup from extracts
	 */
	function __construct( $cleanupWikitext = true ) {
		$this->mCleanWikitext = $cleanupWikitext;
	}

	/**
	 * Default implementation of wikitext highlighting
	 *
	 * The text is split into plain-text lines and "other" lines (templates,
	 * image links, tables, and <ref> blocks when wfCite exists). Snippets
	 * are then picked around matches and the matched terms are wrapped in
	 * <span class='searchmatch'>.
	 *
	 * @param $text String
	 * @param $terms Array: terms to highlight (unescaped); they are inserted
	 *   into regular expressions as-is
	 * @param $contextlines Integer
	 * @param $contextchars Integer
	 * @return String
	 */
	public function highlightText( $text, $terms, $contextlines, $contextchars ) {
		global $wgContLang;
		global $wgSearchHighlightBoundaries;
		$fname = __METHOD__;

		if ( $text == '' ) {
			return '';
		}

		// split text into text + templates/links/tables
		$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
		// first capture group is for detecting nested templates/links/tables/references
		$endPatterns = array(
			1 => '/(\{\{)|(\}\})/', // template
			2 => '/(\[\[)|(\]\])/', // image
			3 => "/(\n\\{\\|)|(\n\\|\\})/" ); // table

		// FIXME: this should prolly be a hook or something
		if ( function_exists( 'wfCite' ) ) {
			$spat .= '|(<ref>)'; // references via cite extension
			$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
		}
		$spat .= '/';
		$textExt = array(); // text extracts
		$otherExt = array(); // other extracts
		wfProfileIn( "$fname-split" );
		$start = 0;
		$textLen = strlen( $text );
		$count = 0; // sequence number to maintain ordering
		while ( $start < $textLen ) {
			// find start of template/image/table
			if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
				$epat = '';
				foreach ( $matches as $key => $val ) {
					if ( $key > 0 && $val[1] != -1 ) {
						if ( $key == 2 ) {
							// see if this is an image link
							$ns = substr( $val[0], 2, -1 );
							if ( $wgContLang->getNsIndex( $ns ) != NS_FILE ) {
								break;
							}
						}
						$epat = $endPatterns[$key];
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
						$start = $val[1];
						break;
					}
				}
				if ( $epat ) {
					// find end (and detect any nested elements)
					$level = 0;
					$offset = $start + 1;
					$found = false;
					while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
						if ( array_key_exists( 2, $endMatches ) ) {
							// found end
							if ( $level == 0 ) {
								$len = strlen( $endMatches[2][0] );
								$off = $endMatches[2][1];
								$this->splitAndAdd( $otherExt, $count,
									substr( $text, $start, $off + $len - $start ) );
								$start = $off + $len;
								$found = true;
								break;
							} else {
								// end of nested element
								$level -= 1;
							}
						} else {
							// nested
							$level += 1;
						}
						$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
					}
					if ( !$found ) {
						// couldn't find appropriate closing tag, skip
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
						$start += strlen( $matches[0][0] );
					}
					continue;
				}
			}
			// else: add as text extract
			$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
			break;
		}

		$all = $textExt + $otherExt; // these have disjunct key sets

		wfProfileOut( "$fname-split" );

		// prepare regexps
		foreach ( $terms as $index => $term ) {
			// manually do upper/lowercase stuff for utf-8 since PHP won't do it
			if ( preg_match( '/[\x80-\xff]/', $term ) ) {
				$terms[$index] = preg_replace_callback( '/./us', array( $this, 'caseCallback' ), $terms[$index] );
			} else {
				$terms[$index] = $term;
			}
		}
		$anyterm = implode( '|', $terms );
		$phrase = implode( "$wgSearchHighlightBoundaries+", $terms );

		// FIXME: a hack to scale contextchars, a correct solution
		// would be to have contextchars actually be char and not byte
		// length, and do proper utf-8 substrings and lengths everywhere,
		// but PHP is making that very hard and unclean to implement :(
		// With no terms the joined string is empty; use a neutral scale
		// rather than dividing by zero.
		$anytermChars = mb_strlen( $anyterm );
		$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
		$contextchars = intval( $contextchars * $scale );

		$patPre = "(^|$wgSearchHighlightBoundaries)";
		$patPost = "($wgSearchHighlightBoundaries|$)";

		$pat1 = "/(" . $phrase . ")/ui";
		$pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

		wfProfileIn( "$fname-extract" );

		$left = $contextlines;

		$snippets = array();
		$offsets = array();

		// show beginning only if it contains all words
		$first = 0;
		$firstText = '';
		foreach ( $textExt as $index => $line ) {
			if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
				$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
				$first = $index;
				break;
			}
		}
		if ( $firstText ) {
			$succ = true;
			// check if first text contains all terms
			foreach ( $terms as $term ) {
				if ( !preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
					$succ = false;
					break;
				}
			}
			if ( $succ ) {
				$snippets[$first] = $firstText;
				$offsets[$first] = 0;
			}
		}
		if ( !$snippets ) {
			// match whole query on text
			$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
			// match whole query on templates/tables/images
			$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
			// match any words on text
			$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
			// match any words on templates/tables/images
			$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

			ksort( $snippets );
		}

		// add extra chars to each snippet to make snippets constant size
		$extended = array();
		if ( count( $snippets ) == 0 ) {
			// couldn't find the target words, just show beginning of article
			$targetchars = $contextchars * $contextlines;
			$snippets[$first] = '';
			$offsets[$first] = 0;
		} else {
			// if begin of the article contains the whole phrase, show only that !!
			if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
				&& $offsets[$first] < $contextchars * 2 ) {
				$snippets = array( $first => $snippets[$first] );
			}

			// calc by how much to extend existing snippets
			$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
		}

		foreach ( $snippets as $index => $line ) {
			$extended[$index] = $line;
			$len = strlen( $line );
			if ( $len < $targetchars - 20 ) {
				// complete this line
				if ( $len < strlen( $all[$index] ) ) {
					$extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index] + $targetchars, $offsets[$index] );
					$len = strlen( $extended[$index] );
				}

				// add more lines
				$add = $index + 1;
				while ( $len < $targetchars - 20
					&& array_key_exists( $add, $all )
					&& !array_key_exists( $add, $snippets ) ) {
					$offsets[$add] = 0;
					$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
					$extended[$add] = $tt;
					$len += strlen( $tt );
					$add++;
				}
			}
		}

		$snippets = $extended;
		$last = -1;
		$extract = '';
		foreach ( $snippets as $index => $line ) {
			if ( $last == -1 ) {
				$extract .= $line; // first line
			} elseif ( $last + 1 == $index && $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] ) ) {
				$extract .= " " . $line; // continuous lines
			} else {
				$extract .= '<b> ... </b>' . $line;
			}

			$last = $index;
		}
		if ( $extract ) {
			$extract .= '<b> ... </b>';
		}

		$processed = array();
		foreach ( $terms as $term ) {
			if ( !isset( $processed[$term] ) ) {
				$pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
				$extract = preg_replace( $pat3,
					"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
				$processed[$term] = true;
			}
		}

		wfProfileOut( "$fname-extract" );

		return $extract;
	}

	/**
	 * Split text into lines and add it to extracts array.
	 * Lines are trimmed; lines that are empty (or otherwise falsy, such as
	 * "0") after trimming are skipped.
	 *
	 * @param $extracts Array: index -> $line
	 * @param $count Integer: next index to use, advanced for every line added
	 * @param $text String
	 */
	function splitAndAdd( &$extracts, &$count, $text ) {
		$split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
		foreach ( $split as $line ) {
			$tt = trim( $line );
			if ( $tt ) {
				$extracts[$count++] = $tt;
			}
		}
	}

	/**
	 * Do manual case conversion for non-ascii chars: a multi-byte character
	 * becomes a regex class holding its lower- and upper-case forms,
	 * single-byte characters pass through unchanged.
	 *
	 * @param $matches Array
	 * @return String
	 */
	function caseCallback( $matches ) {
		global $wgContLang;
		if ( strlen( $matches[0] ) > 1 ) {
			return '[' . $wgContLang->lc( $matches[0] ) . $wgContLang->uc( $matches[0] ) . ']';
		} else {
			return $matches[0];
		}
	}

	/**
	 * Extract part of the text from start to end, but by
	 * not chopping up words
	 *
	 * @param $text String
	 * @param $start Integer
	 * @param $end Integer
	 * @param $posStart Integer: (out) actual start position; only written
	 *   when the variable passed in is not null
	 * @param $posEnd Integer: (out) actual end position; only written when
	 *   the variable passed in is not null
	 * @return String
	 */
	function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
		if ( $start != 0 ) {
			$start = $this->position( $text, $start, 1 );
		}
		if ( $end >= strlen( $text ) ) {
			$end = strlen( $text );
		} else {
			$end = $this->position( $text, $end );
		}

		if ( !is_null( $posStart ) ) {
			$posStart = $start;
		}
		if ( !is_null( $posEnd ) ) {
			$posEnd = $end;
		}

		if ( $end > $start ) {
			return substr( $text, $start, $end - $start );
		} else {
			return '';
		}
	}

	/**
	 * Find a nonletter near a point (index) in the text.
	 * The first nonletter inside a window of 10 bytes either side of the
	 * point is used.
	 *
	 * @param $text String
	 * @param $point Integer
	 * @param $offset Integer: offset to found index
	 * @return Integer: nonletter index plus $offset, or beginning of utf8 char
	 *   at/after $point if none (at most the text length)
	 */
	function position( $text, $point, $offset = 0 ) {
		$tolerance = 10;
		$s = max( 0, $point - $tolerance );
		$l = min( strlen( $text ), $point + $tolerance ) - $s;
		$m = array();
		if ( preg_match( '/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr( $text, $s, $l ), $m, PREG_OFFSET_CAPTURE ) ) {
			return $m[0][1] + $s + $offset;
		}

		$textLen = strlen( $text );
		// Nothing to inspect at or beyond the end of the text
		if ( $point >= $textLen ) {
			return $textLen;
		}

		// check if point is on a valid first UTF8 char
		$char = ord( $text[$point] );
		while ( $char >= 0x80 && $char < 0xc0 ) {
			// skip trailing bytes
			$point++;
			if ( $point >= $textLen ) {
				return $textLen;
			}
			$char = ord( $text[$point] );
		}
		return $point;
	}

	/**
	 * Search extracts for a pattern, and return snippets
	 *
	 * @param $pattern String: regexp for matching lines
	 * @param $extracts Array: extracts to search
	 * @param $linesleft Integer: number of extracts to make; decremented
	 *   for every snippet produced
	 * @param $contextchars Integer: length of snippet
	 * @param $out Array: map for highlighted snippets
	 * @param $offsets Array: map of starting points of snippets
	 * @protected
	 */
	function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
		if ( $linesleft == 0 ) {
			return; // nothing to do
		}
		foreach ( $extracts as $index => $line ) {
			if ( array_key_exists( $index, $out ) ) {
				continue; // this line already highlighted
			}

			$m = array();
			if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) ) {
				continue;
			}

			$offset = $m[0][1];
			$len = strlen( $m[0][0] );
			if ( $offset + $len < $contextchars ) {
				$begin = 0;
			} elseif ( $len > $contextchars ) {
				$begin = $offset;
			} else {
				// centre the match inside the context window
				$begin = $offset + intval( ( $len - $contextchars ) / 2 );
			}

			$end = $begin + $contextchars;

			$posBegin = $begin;
			// basic snippet from this line
			$out[$index] = $this->extract( $line, $begin, $end, $posBegin );
			$offsets[$index] = $posBegin;
			$linesleft--;
			if ( $linesleft == 0 ) {
				return;
			}
		}
	}

	/**
	 * Basic wikitext removal: drops parameterless templates, reduces
	 * templates and links to their trailing part, strips tags and
	 * bold/italic quote markup.
	 *
	 * @param $text String
	 * @return String
	 * @protected
	 */
	function removeWiki( $text ) {
		$fname = __METHOD__;
		wfProfileIn( $fname );

		$text = preg_replace( "/\\{\\{([^|]+?)\\}\\}/", "", $text );
		$text = preg_replace( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text );
		$text = preg_replace( "/\\[\\[([^|]+?)\\]\\]/", "\\1", $text );
		$text = preg_replace_callback( "/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array( $this, 'linkReplace' ), $text );
		$text = preg_replace( "/<\/?[^>]+>/", "", $text );
		$text = preg_replace( "/'''''/", "", $text );
		$text = preg_replace( "/('''|<\/?[iIuUbB]>)/", "", $text );
		$text = preg_replace( "/''/", "", $text );

		wfProfileOut( $fname );
		return $text;
	}

	/**
	 * callback to replace [[target|caption]] kind of links, if
	 * the target is category or image, leave it
	 *
	 * @param $matches Array: 0 => whole link, 1 => "target|", 2 => caption
	 * @return String
	 */
	function linkReplace( $matches ) {
		global $wgContLang;

		$colon = strpos( $matches[1], ':' );
		if ( $colon === false ) {
			return $matches[2]; // replace with caption
		}
		$ns = substr( $matches[1], 0, $colon );
		$index = $wgContLang->getNsIndex( $ns );
		if ( $index !== false && ( $index == NS_FILE || $index == NS_CATEGORY ) ) {
			return $matches[0]; // return the whole thing
		} else {
			return $matches[2];
		}
	}

	/**
	 * Simple & fast snippet extraction, but gives completely unrelevant
	 * snippets
	 *
	 * NOTE(review): terms are highlighted after the line has been
	 * HTML-escaped, so a term that also occurs inside an entity name
	 * (e.g. "amp") would be wrapped there as well - confirm whether that
	 * matters for callers.
	 *
	 * @param $text String
	 * @param $terms Array: inserted into regular expressions as-is
	 * @param $contextlines Integer: maximum number of matching lines to show
	 * @param $contextchars Integer
	 * @return String
	 */
	public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
		global $wgContLang;
		$fname = __METHOD__;

		$lines = explode( "\n", $text );

		$terms = implode( '|', $terms );
		$max = intval( $contextchars ) + 1;
		$pat1 = "/(.*)($terms)(.{0,$max})/i";

		$extract = "";
		wfProfileIn( "$fname-extract" );
		foreach ( $lines as $line ) {
			if ( 0 == $contextlines ) {
				break;
			}
			$m = array();
			if ( !preg_match( $pat1, $line, $m ) ) {
				continue;
			}
			--$contextlines;
			$pre = $wgContLang->truncate( $m[1], -$contextchars );

			if ( count( $m ) < 3 ) {
				$post = '';
			} else {
				$post = $wgContLang->truncate( $m[3], $contextchars );
			}

			$found = $m[2];

			$line = htmlspecialchars( $pre . $found . $post );
			$pat2 = '/(' . $terms . ")/i";
			$line = preg_replace( $pat2,
				"<span class='searchmatch'>\\1</span>", $line );

			$extract .= "{$line}\n";
		}
		wfProfileOut( "$fname-extract" );

		return $extract;
	}

}
1227
/**
 * Placeholder engine for database backends without search support.
 * It adds nothing to SearchEngine, so every call uses the base class
 * implementation.
 *
 * @todo Fixme: dummy class should probably try something at least mildly useful,
 * such as a LIKE search through titles.
 * @ingroup Search
 */
class SearchEngineDummy extends SearchEngine {
	// intentionally empty
}