Fix for r63578: also change wordSegmentation() to segmentByWord() here
[lhc/web/wiklou.git] / includes / search / SearchEngine.php
1 <?php
2 /**
3 * @defgroup Search Search
4 *
5 * @file
6 * @ingroup Search
7 */
8
9 /**
10 * Base class for search engine backends; concrete engines override the stub methods
11 * @ingroup Search
12 */
13 class SearchEngine {
14 var $limit = 10;
15 var $offset = 0;
16 var $prefix = '';
17 var $searchTerms = array();
18 var $namespaces = array( NS_MAIN );
19 var $showRedirects = false;
20
21 /**
22 * Perform a full text search query and return a result set.
23 * If title searches are not supported or disabled, return null.
24 * STUB
25 *
26 * @param $term String: raw search term
27 * @return SearchResultSet
28 */
29 function searchText( $term ) {
30 return null;
31 }
32
33 /**
34 * Perform a title-only search query and return a result set.
35 * If title searches are not supported or disabled, return null.
36 * STUB
37 *
38 * @param $term String: raw search term
39 * @return SearchResultSet
40 */
41 function searchTitle( $term ) {
42 return null;
43 }
44
45 /** If this search backend can list/unlist redirects */
46 function acceptListRedirects() {
47 return true;
48 }
49
50 /**
51 * When overridden in derived class, performs database-specific conversions
52 * on text to be used for searching or updating search index.
53 * Default implementation only applies the content language's word segmentation (segmentByWord()).
54 *
55 * @param $string string: String to process
56 * @return string
57 */
58 public function normalizeText( $string ) {
59 global $wgContLang;
60
61 // Some languages such as Chinese require word segmentation
62 return $wgContLang->segmentByWord( $string );
63 }
64
65 /**
66 * Transform search term in cases when parts of the query came as different GET params (when supported)
67 * e.g. for prefix queries: search=test&prefix=Main_Page/Archive -> test prefix:Main Page/Archive
68 */
69 function transformSearchTerm( $term ) {
70 return $term;
71 }
72
73 /**
74 * If an exact title match can be found, or a very slightly close match,
75 * return the title. If no match, returns NULL.
76 *
77 * @param $searchterm String
78 * @return Title
79 */
80 public static function getNearMatch( $searchterm ) {
81 $title = self::getNearMatchInternal( $searchterm );
82
83 wfRunHooks( 'SearchGetNearMatchComplete', array( $searchterm, &$title ) );
84 return $title;
85 }
86
/**
 * Do the actual work of finding a near title match for getNearMatch().
 *
 * @param $searchterm String
 * @return Title, or null if nothing suitable was found
 */
private static function getNearMatchInternal( $searchterm ) {
	global $wgContLang;

	// The raw term always comes first, followed by its language variants (if any)
	$candidates = array( $searchterm );
	if ( $wgContLang->hasVariants() ) {
		$candidates = array_merge( $candidates, $wgContLang->convertLinkToAllVariants( $searchterm ) );
	}

	// A hook may abort the lookup and supply its own result
	$titleResult = null;
	if ( !wfRunHooks( 'SearchGetNearMatchBefore', array( $candidates, &$titleResult ) ) ) {
		return $titleResult;
	}

	// Content language case transformations tried, in this order, when the
	// exact title does not exist: all lower case (first letter ends up
	// capitalized), capitalized words, all upper case, and finally
	// Word-Caps-Breaking-At-Word-Breaks for hyphenated names etc.
	$caseTransforms = array( 'lc', 'ucwords', 'uc', 'ucwordbreaks' );

	foreach ( $candidates as $term ) {
		// Exact match? No need to look further.
		$title = Title::newFromText( $term );
		if ( is_null( $title ) ) {
			return null;
		}
		if ( $title->getNamespace() == NS_SPECIAL || $title->isExternal() || $title->exists() ) {
			return $title;
		}

		// See if it still otherwise has content in some sane sense
		if ( MediaWiki::articleFromTitle( $title )->hasViewableContent() ) {
			return $title;
		}

		foreach ( $caseTransforms as $transform ) {
			$title = Title::newFromText( $wgContLang->$transform( $term ) );
			if ( $title && $title->exists() ) {
				return $title;
			}
		}

		// Give hooks a chance at better match variants
		$title = null;
		if ( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) {
			return $title;
		}
	}

	$title = Title::newFromText( $searchterm );

	// Entering an IP address goes to the contributions page
	$isUserIp = $title->getNamespace() == NS_USER && User::isIP( $title->getText() );
	if ( $isUserIp || User::isIP( trim( $searchterm ) ) ) {
		return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
	}

	// Entering a user goes to the user page whether it's there or not
	if ( $title->getNamespace() == NS_USER ) {
		return $title;
	}

	// Go to images that exist even if there's no local page.
	// There may have been a funny upload, or it may be on a shared
	// file repository such as Wikimedia Commons.
	if ( $title->getNamespace() == NS_FILE && wfFindFile( $title ) ) {
		return $title;
	}

	// MediaWiki namespace? Page may be "implied" if not customized.
	// Just return it, with caps forced as the message system likes it.
	if ( $title->getNamespace() == NS_MEDIAWIKI ) {
		return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
	}

	// Quoted term? Try again without the quotes...
	$unquoted = array();
	if ( preg_match( '/^"([^"]+)"$/', $searchterm, $unquoted ) ) {
		return self::getNearMatch( $unquoted[1] );
	}

	return null;
}
192
193 public static function legalSearchChars() {
194 return "A-Za-z_'.0-9\\x80-\\xFF\\-";
195 }
196
197 /**
198 * Set the maximum number of results to return
199 * and how many to skip before returning the first.
200 *
201 * @param $limit Integer
202 * @param $offset Integer
203 */
204 function setLimitOffset( $limit, $offset = 0 ) {
205 $this->limit = intval( $limit );
206 $this->offset = intval( $offset );
207 }
208
209 /**
210 * Set which namespaces the search should include.
211 * Give an array of namespace index numbers.
212 *
213 * @param $namespaces Array
214 */
215 function setNamespaces( $namespaces ) {
216 $this->namespaces = $namespaces;
217 }
218
/**
 * Parse some common prefixes off the front of a query: the localized
 * "all" keyword (search everything) or a namespace name. As a side effect
 * $this->namespaces is updated to match the recognized prefix.
 *
 * @param $query String
 * @return String: the query with a recognized prefix removed, or the
 *   original query if no prefix matched or the prefix was the whole query
 */
function replacePrefixes( $query ){
	global $wgContLang;

	$parsed = $query;
	$colonPos = strpos( $query, ':' );

	// Without a colon there cannot be a prefix, so only the hook runs
	if ( $colonPos !== false ) {
		$allkeyword = wfMsgForContent( 'searchall' ) . ":";
		if ( strncmp( $query, $allkeyword, strlen( $allkeyword ) ) == 0 ) {
			$this->namespaces = null;
			$parsed = substr( $query, strlen( $allkeyword ) );
		} else {
			$prefix = substr( $query, 0, $colonPos );
			$index = $wgContLang->getNsIndex( $prefix );
			if ( $index !== false ) {
				$this->namespaces = array( $index );
				$parsed = substr( $query, $colonPos + 1 );
			}
		}

		if ( trim( $parsed ) == '' ) {
			// prefix was the whole query
			$parsed = $query;
		}
	}

	wfRunHooks( 'SearchEngineReplacePrefixesComplete', array( $this, $query, &$parsed ) );

	return $parsed;
}
253
254 /**
255 * Make a list of searchable namespaces and their canonical names.
256 * @return Array
257 */
258 public static function searchableNamespaces() {
259 global $wgContLang;
260 $arr = array();
261 foreach( $wgContLang->getNamespaces() as $ns => $name ) {
262 if( $ns >= NS_MAIN ) {
263 $arr[$ns] = $name;
264 }
265 }
266
267 wfRunHooks( 'SearchableNamespaces', array( &$arr ) );
268 return $arr;
269 }
270
/**
 * Work out which namespaces a user searches by default, as a list of
 * namespace index numbers.
 *
 * @param $user User
 * @return Array
 */
public static function userNamespaces( $user ) {
	global $wgSearchEverythingOnlyLoggedIn;

	// The "search everything" preference may be restricted to logged-in users
	$mayUsePreference = !$wgSearchEverythingOnlyLoggedIn || $user->isLoggedIn();
	$searchEverything = $mayUsePreference ? $user->getOption( 'searcheverything' ) : false;

	// searcheverything overrides other options
	if ( $searchEverything ) {
		return array_keys( self::searchableNamespaces() );
	}

	// Otherwise keep only those stored namespaces that are still searchable
	$preferred = Preferences::loadOldSearchNs( $user );
	$searchable = array_keys( self::searchableNamespaces() );

	return array_intersect( $preferred, $searchable );
}
298
299 /**
300 * Find snippet highlight settings for a given user
301 *
302 * @param $user User
303 * @return Array contextlines, contextchars
304 */
305 public static function userHighlightPrefs( &$user ){
306 //$contextlines = $user->getOption( 'contextlines', 5 );
307 //$contextchars = $user->getOption( 'contextchars', 50 );
308 $contextlines = 2; // Hardcode this. Old defaults sucked. :)
309 $contextchars = 75; // same as above.... :P
310 return array($contextlines, $contextchars);
311 }
312
313 /**
314 * An array of namespaces indexes to be searched by default
315 *
316 * @return Array
317 */
318 public static function defaultNamespaces(){
319 global $wgNamespacesToBeSearchedDefault;
320
321 return array_keys($wgNamespacesToBeSearchedDefault, true);
322 }
323
324 /**
325 * Get a list of namespace names useful for showing in tooltips
326 * and preferences
327 *
328 * @param $namespaces Array
329 */
330 public static function namespacesAsText( $namespaces ){
331 global $wgContLang;
332
333 $formatted = array_map( array($wgContLang,'getFormattedNsText'), $namespaces );
334 foreach( $formatted as $key => $ns ){
335 if ( empty($ns) )
336 $formatted[$key] = wfMsg( 'blanknamespace' );
337 }
338 return $formatted;
339 }
340
341 /**
342 * Return the help namespaces to be shown on Special:Search
343 *
344 * @return Array
345 */
346 public static function helpNamespaces() {
347 global $wgNamespacesToBeSearchedHelp;
348
349 return array_keys( $wgNamespacesToBeSearchedHelp, true );
350 }
351
352 /**
353 * Return a 'cleaned up' search string
354 *
355 * @param $text String
356 * @return String
357 */
358 function filter( $text ) {
359 $lc = $this->legalSearchChars();
360 return trim( preg_replace( "/[^{$lc}]/", " ", $text ) );
361 }
/**
 * Instantiate the search engine class to use: $wgSearchType when it is
 * set, otherwise whatever the database backend reports through
 * getSearchEngine(). The instance is returned with limit and offset
 * both set to 0.
 *
 * @return SearchEngine
 */
public static function create() {
	global $wgSearchType;

	$dbr = wfGetDB( DB_SLAVE );
	$class = $wgSearchType ? $wgSearchType : $dbr->getSearchEngine();

	$search = new $class( $dbr );
	$search->setLimitOffset( 0, 0 );

	return $search;
}
380
381 /**
382 * Create or update the search index record for the given page.
383 * Title and text should be pre-processed.
384 * STUB
385 *
386 * @param $id Integer
387 * @param $title String
388 * @param $text String
389 */
390 function update( $id, $title, $text ) {
391 // no-op
392 }
393
394 /**
395 * Update a search index record's title only.
396 * Title should be pre-processed.
397 * STUB
398 *
399 * @param $id Integer
400 * @param $title String
401 */
402 function updateTitle( $id, $title ) {
403 // no-op
404 }
405
/**
 * Get the OpenSearch suggestion URL template: $wgOpenSearchTemplate when
 * configured, otherwise an api.php URL restricted to the default
 * search namespaces.
 *
 * @return String
 */
public static function getOpenSearchTemplate() {
	global $wgOpenSearchTemplate, $wgServer, $wgScriptPath;

	if ( $wgOpenSearchTemplate ) {
		return $wgOpenSearchTemplate;
	}

	$ns = implode( '|', SearchEngine::defaultNamespaces() );
	if ( !$ns ) {
		// No default namespaces (or only the main one): fall back to "0"
		$ns = "0";
	}

	$apiBase = $wgServer . $wgScriptPath;
	return $apiBase . '/api.php?action=opensearch&search={searchTerms}&namespace=' . $ns;
}
421
422 /**
423 * Get internal MediaWiki Suggest template
424 *
425 * @return String
426 */
427 public static function getMWSuggestTemplate() {
428 global $wgMWSuggestTemplate, $wgServer, $wgScriptPath;
429 if($wgMWSuggestTemplate)
430 return $wgMWSuggestTemplate;
431 else
432 return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace={namespaces}&suggest';
433 }
434 }
435
436 /**
437 * @ingroup Search
438 */
439 class SearchResultSet {
440 /**
441 * Fetch an array of regular expression fragments for matching
442 * the search terms as parsed by this engine in a text extract.
443 * STUB
444 *
445 * @return Array
446 */
447 function termMatches() {
448 return array();
449 }
450
451 function numRows() {
452 return 0;
453 }
454
455 /**
456 * Return true if results are included in this result set.
457 * STUB
458 *
459 * @return Boolean
460 */
461 function hasResults() {
462 return false;
463 }
464
465 /**
466 * Some search modes return a total hit count for the query
467 * in the entire article database. This may include pages
468 * in namespaces that would not be matched on the given
469 * settings.
470 *
471 * Return null if no total hits number is supported.
472 *
473 * @return Integer
474 */
475 function getTotalHits() {
476 return null;
477 }
478
479 /**
480 * Some search modes return a suggested alternate term if there are
481 * no exact hits. Returns true if there is one on this set.
482 *
483 * @return Boolean
484 */
485 function hasSuggestion() {
486 return false;
487 }
488
489 /**
490 * @return String: suggested query, null if none
491 */
492 function getSuggestionQuery(){
493 return null;
494 }
495
496 /**
497 * @return String: HTML highlighted suggested query, '' if none
498 */
499 function getSuggestionSnippet(){
500 return '';
501 }
502
503 /**
504 * Return information about how and from where the results were fetched,
505 * should be useful for diagnostics and debugging
506 *
507 * @return String
508 */
509 function getInfo() {
510 return null;
511 }
512
513 /**
514 * Return a result set of hits on other (multiple) wikis associated with this one
515 *
516 * @return SearchResultSet
517 */
518 function getInterwikiResults() {
519 return null;
520 }
521
522 /**
523 * Check if there are results on other wikis
524 *
525 * @return Boolean
526 */
527 function hasInterwikiResults() {
528 return $this->getInterwikiResults() != null;
529 }
530
531 /**
532 * Fetches next search result, or false.
533 * STUB
534 *
535 * @return SearchResult
536 */
537 function next() {
538 return false;
539 }
540
541 /**
542 * Frees the result set, if applicable.
543 */
544 function free() {
545 // ...
546 }
547 }
548
/**
 * This class is used for different SQL-based search engines shipped with MediaWiki
 */
class SqlSearchResultSet extends SearchResultSet {
	/**
	 * Result object handed to the constructor; every accessor treats the
	 * value false as "there is no result".
	 */
	var $mResultSet = null;

	/** Value returned unchanged by termMatches() */
	var $mTerms = null;

	/**
	 * @param $resultSet Mixed: result object providing numRows(),
	 *   fetchObject() and free(), or false
	 * @param $terms Array: regular expression fragments for the search terms
	 */
	function __construct( $resultSet, $terms ) {
		$this->mResultSet = $resultSet;
		$this->mTerms = $terms;
	}

	/**
	 * @return Array: the terms given to the constructor
	 */
	function termMatches() {
		return $this->mTerms;
	}

	/**
	 * @return Integer: number of rows in the result, or false if there is no result
	 */
	function numRows() {
		if ( $this->mResultSet === false ) {
			return false;
		}

		return $this->mResultSet->numRows();
	}

	/**
	 * Fetch the next row wrapped in a SearchResult.
	 *
	 * @return SearchResult, or false when there is no result or no row is left
	 */
	function next() {
		if ( $this->mResultSet === false ) {
			return false;
		}

		$row = $this->mResultSet->fetchObject();
		if ( $row === false ) {
			return false;
		}
		return new SearchResult( $row );
	}

	/**
	 * Free the underlying result, if there is one.
	 */
	function free() {
		if ( $this->mResultSet === false ) {
			return false;
		}

		$this->mResultSet->free();
	}
}
586
587 /**
588 * @ingroup Search
589 */
590 class SearchResultTooMany {
591 ## Some search engines may bail out if too many matches are found
592 }
593
594
595 /**
596 * @todo Fixme: This class is horribly factored. It would probably be better to
597 * have a useful base class to which you pass some standard information, then
598 * let the fancy self-highlighters extend that.
599 * @ingroup Search
600 */
601 class SearchResult {
602 var $mRevision = null;
603 var $mImage = null;
604
605 function __construct( $row ) {
606 $this->mTitle = Title::makeTitle( $row->page_namespace, $row->page_title );
607 if( !is_null($this->mTitle) ){
608 $this->mRevision = Revision::newFromTitle( $this->mTitle );
609 if( $this->mTitle->getNamespace() === NS_FILE )
610 $this->mImage = wfFindFile( $this->mTitle );
611 }
612 }
613
614 /**
615 * Check if this result points to an invalid title
616 *
617 * @return Boolean
618 */
619 function isBrokenTitle(){
620 if( is_null($this->mTitle) )
621 return true;
622 return false;
623 }
624
625 /**
626 * Check if target page is missing, happens when index is out of date
627 *
628 * @return Boolean
629 */
630 function isMissingRevision(){
631 return !$this->mRevision && !$this->mImage;
632 }
633
634 /**
635 * @return Title
636 */
637 function getTitle() {
638 return $this->mTitle;
639 }
640
641 /**
642 * @return Double or null if not supported
643 */
644 function getScore() {
645 return null;
646 }
647
648 /**
649 * Lazy initialization of article text from DB
650 */
651 protected function initText(){
652 if( !isset($this->mText) ){
653 if($this->mRevision != null)
654 $this->mText = $this->mRevision->getText();
655 else // TODO: can we fetch raw wikitext for commons images?
656 $this->mText = '';
657
658 }
659 }
660
661 /**
662 * @param $terms Array: terms to highlight
663 * @return String: highlighted text snippet, null (and not '') if not supported
664 */
665 function getTextSnippet($terms){
666 global $wgUser, $wgAdvancedSearchHighlighting;
667 $this->initText();
668 list($contextlines,$contextchars) = SearchEngine::userHighlightPrefs($wgUser);
669 $h = new SearchHighlighter();
670 if( $wgAdvancedSearchHighlighting )
671 return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars );
672 else
673 return $h->highlightSimple( $this->mText, $terms, $contextlines, $contextchars );
674 }
675
676 /**
677 * @param $terms Array: terms to highlight
678 * @return String: highlighted title, '' if not supported
679 */
680 function getTitleSnippet($terms){
681 return '';
682 }
683
684 /**
685 * @param $terms Array: terms to highlight
686 * @return String: highlighted redirect name (redirect to this page), '' if none or not supported
687 */
688 function getRedirectSnippet($terms){
689 return '';
690 }
691
692 /**
693 * @return Title object for the redirect to this page, null if none or not supported
694 */
695 function getRedirectTitle(){
696 return null;
697 }
698
699 /**
700 * @return string highlighted relevant section name, '' if none or not supported
701 */
702 function getSectionSnippet(){
703 return '';
704 }
705
706 /**
707 * @return Title object (pagename+fragment) for the section, null if none or not supported
708 */
709 function getSectionTitle(){
710 return null;
711 }
712
713 /**
714 * @return String: timestamp
715 */
716 function getTimestamp(){
717 if( $this->mRevision )
718 return $this->mRevision->getTimestamp();
719 else if( $this->mImage )
720 return $this->mImage->getTimestamp();
721 return '';
722 }
723
724 /**
725 * @return Integer: number of words
726 */
727 function getWordCount(){
728 $this->initText();
729 return str_word_count( $this->mText );
730 }
731
732 /**
733 * @return Integer: size in bytes
734 */
735 function getByteSize(){
736 $this->initText();
737 return strlen( $this->mText );
738 }
739
740 /**
741 * @return Boolean if hit has related articles
742 */
743 function hasRelated(){
744 return false;
745 }
746
747 /**
748 * @return String: interwiki prefix of the title (return iw even if title is broken)
749 */
750 function getInterwikiPrefix(){
751 return '';
752 }
753 }
754
755 /**
756 * Highlight bits of wikitext
757 *
758 * @ingroup Search
759 */
760 class SearchHighlighter {
761 var $mCleanWikitext = true;
762
/**
 * @param $cleanupWikitext Boolean: stored in $mCleanWikitext; when true,
 *   splitAndAdd() runs text through removeWiki() before splitting it
 */
function __construct( $cleanupWikitext = true ) {
	$this->mCleanWikitext = $cleanupWikitext;
}

/**
 * PHP 4 style constructor name, kept as a plain method so that existing
 * callers (e.g. subclasses invoking it explicitly) continue to work.
 *
 * @param $cleanupWikitext Boolean
 */
function SearchHighlighter( $cleanupWikitext = true ) {
	$this->__construct( $cleanupWikitext );
}
766
/**
 * Default implementation of wikitext highlighting.
 *
 * The text is split into plain-text extracts and "other" extracts
 * (templates, image links, tables and, when wfCite() exists, <ref>s).
 * Lines matching the terms are selected, padded to a roughly constant
 * size, joined with '<b> ... </b>' separators, and each term is wrapped
 * in <span class='searchmatch'>.
 *
 * NOTE(review): the extract is not passed through htmlspecialchars() here
 * (the call is commented out below) - confirm escaping with the callers.
 *
 * @param $text String
 * @param $terms Array: terms to highlight; they are inserted into regular
 *   expressions as-is
 * @param $contextlines Integer
 * @param $contextchars Integer
 * @return String
 */
public function highlightText( $text, $terms, $contextlines, $contextchars ) {
	global $wgContLang;
	global $wgSearchHighlightBoundaries;
	$fname = __METHOD__;

	if ( $text == '' )
		return '';

	// split text into text + templates/links/tables
	$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
	// first capture group is for detecting nested templates/links/tables/references
	$endPatterns = array(
		1 => '/(\{\{)|(\}\})/', // template
		2 => '/(\[\[)|(\]\])/', // image
		3 => "/(\n\\{\\|)|(\n\\|\\})/"); // table

	// FIXME: this should prolly be a hook or something
	if ( function_exists( 'wfCite' ) ) {
		$spat .= '|(<ref>)'; // references via cite extension
		$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
	}
	$spat .= '/';
	$textExt = array(); // text extracts
	$otherExt = array(); // other extracts
	wfProfileIn( "$fname-split" );
	$start = 0;
	$textLen = strlen( $text );
	$count = 0; // sequence number to maintain ordering
	while ( $start < $textLen ) {
		// find start of template/image/table
		if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
			$epat = '';
			foreach ( $matches as $key => $val ) {
				if ( $key > 0 && $val[1] != -1 ) {
					if ( $key == 2 ) {
						// see if this is an image link
						$ns = substr( $val[0], 2, -1 );
						if ( $wgContLang->getNsIndex( $ns ) != NS_FILE )
							break;

					}
					$epat = $endPatterns[$key];
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
					$start = $val[1];
					break;
				}
			}
			if ( $epat ) {
				// find end (and detect any nested elements)
				$level = 0;
				$offset = $start + 1;
				$found = false;
				while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
					if ( array_key_exists( 2, $endMatches ) ) {
						// found end
						if ( $level == 0 ) {
							$len = strlen( $endMatches[2][0] );
							$off = $endMatches[2][1];
							$this->splitAndAdd( $otherExt, $count,
								substr( $text, $start, $off + $len - $start ) );
							$start = $off + $len;
							$found = true;
							break;
						} else {
							// end of nested element
							$level -= 1;
						}
					} else {
						// nested
						$level += 1;
					}
					$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
				}
				if ( ! $found ) {
					// couldn't find appropriate closing tag, skip
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
					$start += strlen( $matches[0][0] );
				}
				continue;
			}
		}
		// else: add as text extract
		$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
		break;
	}

	$all = $textExt + $otherExt; // these have disjunct key sets

	wfProfileOut( "$fname-split" );

	// prepare regexps
	foreach ( $terms as $index => $term ) {
		// manually do upper/lowercase stuff for utf-8 since PHP won't do it
		if ( preg_match( '/[\x80-\xff]/', $term ) ) {
			$terms[$index] = preg_replace_callback( '/./us', array( $this, 'caseCallback' ), $terms[$index] );
		} else {
			$terms[$index] = $term;
		}
	}
	$anyterm = implode( '|', $terms );
	$phrase = implode( "$wgSearchHighlightBoundaries+", $terms );

	// FIXME: a hack to scale contextchars, a correct solution
	// would be to have contextchars actually be char and not byte
	// length, and do proper utf-8 substrings and lengths everywhere,
	// but PHP is making that very hard and unclean to implement :(
	// With no terms $anyterm is empty; skip the scaling then instead of
	// dividing by zero.
	$anytermChars = mb_strlen( $anyterm );
	$scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
	$contextchars = intval( $contextchars * $scale );

	$patPre = "(^|$wgSearchHighlightBoundaries)";
	$patPost = "($wgSearchHighlightBoundaries|$)";

	$pat1 = "/(".$phrase.")/ui";
	$pat2 = "/$patPre(".$anyterm.")$patPost/ui";

	wfProfileIn( "$fname-extract" );

	$left = $contextlines;

	$snippets = array();
	$offsets = array();

	// show beginning only if it contains all words
	$first = 0;
	$firstText = '';
	foreach ( $textExt as $index => $line ) {
		if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
			$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
			$first = $index;
			break;
		}
	}
	if ( $firstText ) {
		$succ = true;
		// check if first text contains all terms
		foreach ( $terms as $term ) {
			if ( ! preg_match( "/$patPre".$term."$patPost/ui", $firstText ) ) {
				$succ = false;
				break;
			}
		}
		if ( $succ ) {
			$snippets[$first] = $firstText;
			$offsets[$first] = 0;
		}
	}
	if ( ! $snippets ) {
		// match whole query on text
		$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
		// match whole query on templates/tables/images
		$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
		// match any words on text
		$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
		// match any words on templates/tables/images
		$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

		ksort( $snippets );
	}

	// add extra chars to each snippet to make snippets constant size
	$extended = array();
	if ( count( $snippets ) == 0 ) {
		// couldn't find the target words, just show beginning of article
		$targetchars = $contextchars * $contextlines;
		$snippets[$first] = '';
		$offsets[$first] = 0;
	} else {
		// if begin of the article contains the whole phrase, show only that !!
		if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
			&& $offsets[$first] < $contextchars * 2 ) {
			$snippets = array ( $first => $snippets[$first] );
		}

		// calc by how much to extend existing snippets
		$targetchars = intval( ( $contextchars * $contextlines ) / count ( $snippets ) );
	}

	foreach ( $snippets as $index => $line ) {
		$extended[$index] = $line;
		$len = strlen( $line );
		if ( $len < $targetchars - 20 ) {
			// complete this line; $all has no such entry when the text
			// produced no extracts at all
			if ( isset( $all[$index] ) && $len < strlen( $all[$index] ) ) {
				$extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index]+$targetchars, $offsets[$index] );
				$len = strlen( $extended[$index] );
			}

			// add more lines
			$add = $index + 1;
			while ( $len < $targetchars - 20
				&& array_key_exists( $add, $all )
				&& !array_key_exists( $add, $snippets ) ) {
				$offsets[$add] = 0;
				$tt = "\n".$this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
				$extended[$add] = $tt;
				$len += strlen( $tt );
				$add++;
			}
		}
	}

	//$snippets = array_map('htmlspecialchars', $extended);
	$snippets = $extended;
	$last = -1;
	$extract = '';
	foreach ( $snippets as $index => $line ) {
		if ( $last == -1 )
			$extract .= $line; // first line
		elseif ( $last+1 == $index && $offsets[$last]+strlen( $snippets[$last] ) >= strlen( $all[$last] ) )
			$extract .= " ".$line; // continuous lines
		else
			$extract .= '<b> ... </b>' . $line;

		$last = $index;
	}
	if ( $extract )
		$extract .= '<b> ... </b>';

	$processed = array();
	foreach ( $terms as $term ) {
		if ( ! isset( $processed[$term] ) ) {
			$pat3 = "/$patPre(".$term.")$patPost/ui"; // highlight word
			$extract = preg_replace( $pat3,
				"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
			$processed[$term] = true;
		}
	}

	wfProfileOut( "$fname-extract" );

	return $extract;
}
1008
/**
 * Split text into lines and append every non-empty (after trimming) line
 * to the extracts array. When $mCleanWikitext is set the text is first
 * run through removeWiki().
 *
 * @param $extracts Array: (in/out) index -> line
 * @param $count Integer: (in/out) next index to use; incremented per added line
 * @param $text String
 */
function splitAndAdd(&$extracts, &$count, $text){
	$split = explode( "\n", $this->mCleanWikitext? $this->removeWiki($text) : $text );
	foreach ( $split as $line ) {
		$tt = trim( $line );
		// Compare against '' explicitly: a plain truthiness test would
		// also throw away a line that consists only of "0".
		if ( $tt !== '' ) {
			$extracts[$count++] = $tt;
		}
	}
}
1024
1025 /**
1026 * Do manual case conversion for non-ascii chars
1027 *
1028 * @param $matches Array
1029 */
1030 function caseCallback($matches){
1031 global $wgContLang;
1032 if( strlen($matches[0]) > 1 ){
1033 return '['.$wgContLang->lc($matches[0]).$wgContLang->uc($matches[0]).']';
1034 } else
1035 return $matches[0];
1036 }
1037
1038 /**
1039 * Extract part of the text from start to end, but by
1040 * not chopping up words
1041 * @param $text String
1042 * @param $start Integer
1043 * @param $end Integer
1044 * @param $posStart Integer: (out) actual start position
1045 * @param $posEnd Integer: (out) actual end position
1046 * @return String
1047 */
1048 function extract($text, $start, $end, &$posStart = null, &$posEnd = null ){
1049 global $wgContLang;
1050
1051 if( $start != 0)
1052 $start = $this->position( $text, $start, 1 );
1053 if( $end >= strlen($text) )
1054 $end = strlen($text);
1055 else
1056 $end = $this->position( $text, $end );
1057
1058 if(!is_null($posStart))
1059 $posStart = $start;
1060 if(!is_null($posEnd))
1061 $posEnd = $end;
1062
1063 if($end > $start)
1064 return substr($text, $start, $end-$start);
1065 else
1066 return '';
1067 }
1068
/**
 * Find a nonletter near a point (index) in the text.
 *
 * A window of 10 bytes on either side of $point is searched and the
 * first boundary character found inside it wins. If there is none,
 * $point itself is returned, moved forward past any UTF-8 continuation
 * bytes so that it never lands in the middle of a character.
 *
 * @param $text String
 * @param $point Integer
 * @param $offset Integer: offset added to the found boundary index
 * @return Integer: boundary index (plus $offset), or beginning of utf8 char if none
 */
function position($text, $point, $offset=0 ){
	$tolerance = 10;
	$textLen = strlen( $text );
	$s = max( 0, $point - $tolerance );
	$l = min( $textLen, $point + $tolerance ) - $s;
	$m = array();
	if ( preg_match('/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr( $text, $s, $l ), $m, PREG_OFFSET_CAPTURE ) ) {
		return $m[0][1] + $s + $offset;
	}

	// Nothing to inspect at or beyond the end of the text; reading
	// $text[$point] there would be an out-of-range string offset.
	if ( $point >= $textLen ) {
		return $textLen;
	}

	// check if point is on a valid first UTF8 char
	$char = ord( $text[$point] );
	while ( $char >= 0x80 && $char < 0xc0 ) {
		// skip trailing bytes
		$point++;
		if ( $point >= $textLen ) {
			return $textLen;
		}
		$char = ord( $text[$point] );
	}
	return $point;
}
1098
1099 /**
1100 * Search extracts for a pattern, and return snippets
1101 *
1102 * @param $pattern String: regexp for matching lines
1103 * @param $extracts Array: extracts to search
1104 * @param $linesleft Integer: number of extracts to make
1105 * @param $contextchars Integer: length of snippet
1106 * @param $out Array: map for highlighted snippets
1107 * @param $offsets Array: map of starting points of snippets
1108 * @protected
1109 */
1110 function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ){
1111 if($linesleft == 0)
1112 return; // nothing to do
1113 foreach($extracts as $index => $line){
1114 if( array_key_exists($index,$out) )
1115 continue; // this line already highlighted
1116
1117 $m = array();
1118 if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) )
1119 continue;
1120
1121 $offset = $m[0][1];
1122 $len = strlen($m[0][0]);
1123 if($offset + $len < $contextchars)
1124 $begin = 0;
1125 elseif( $len > $contextchars)
1126 $begin = $offset;
1127 else
1128 $begin = $offset + intval( ($len - $contextchars) / 2 );
1129
1130 $end = $begin + $contextchars;
1131
1132 $posBegin = $begin;
1133 // basic snippet from this line
1134 $out[$index] = $this->extract($line,$begin,$end,$posBegin);
1135 $offsets[$index] = $posBegin;
1136 $linesleft--;
1137 if($linesleft == 0)
1138 return;
1139 }
1140 }
1141
1142 /**
1143 * Basic wikitext removal
1144 * @protected
1145 */
1146 function removeWiki($text) {
1147 $fname = __METHOD__;
1148 wfProfileIn( $fname );
1149
1150 //$text = preg_replace("/'{2,5}/", "", $text);
1151 //$text = preg_replace("/\[[a-z]+:\/\/[^ ]+ ([^]]+)\]/", "\\2", $text);
1152 //$text = preg_replace("/\[\[([^]|]+)\]\]/", "\\1", $text);
1153 //$text = preg_replace("/\[\[([^]]+\|)?([^|]]+)\]\]/", "\\2", $text);
1154 //$text = preg_replace("/\\{\\|(.*?)\\|\\}/", "", $text);
1155 //$text = preg_replace("/\\[\\[[A-Za-z_-]+:([^|]+?)\\]\\]/", "", $text);
1156 $text = preg_replace("/\\{\\{([^|]+?)\\}\\}/", "", $text);
1157 $text = preg_replace("/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text);
1158 $text = preg_replace("/\\[\\[([^|]+?)\\]\\]/", "\\1", $text);
1159 $text = preg_replace_callback("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array($this,'linkReplace'), $text);
1160 //$text = preg_replace("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", "\\2", $text);
1161 $text = preg_replace("/<\/?[^>]+>/", "", $text);
1162 $text = preg_replace("/'''''/", "", $text);
1163 $text = preg_replace("/('''|<\/?[iIuUbB]>)/", "", $text);
1164 $text = preg_replace("/''/", "", $text);
1165
1166 wfProfileOut( $fname );
1167 return $text;
1168 }
1169
1170 /**
1171 * callback to replace [[target|caption]] kind of links, if
1172 * the target is category or image, leave it
1173 *
1174 * @param $matches Array
1175 */
1176 function linkReplace($matches){
1177 $colon = strpos( $matches[1], ':' );
1178 if( $colon === false )
1179 return $matches[2]; // replace with caption
1180 global $wgContLang;
1181 $ns = substr( $matches[1], 0, $colon );
1182 $index = $wgContLang->getNsIndex($ns);
1183 if( $index !== false && ($index == NS_FILE || $index == NS_CATEGORY) )
1184 return $matches[0]; // return the whole thing
1185 else
1186 return $matches[2];
1187
1188 }
1189
/**
 * Simple & fast snippet extraction, but gives completely irrelevant
 * snippets.
 *
 * Each line of the text is matched against the terms; for at most
 * $contextlines matching lines, the text around the last match in the
 * line is truncated to $contextchars on both sides, HTML-escaped, and
 * every term occurrence is wrapped in <span class='searchmatch'>.
 *
 * @param $text String
 * @param $terms Array: terms to highlight; they are inserted into regular
 *   expressions as-is
 * @param $contextlines Integer
 * @param $contextchars Integer
 * @return String: one line of HTML per matching line, '' if there are no terms
 */
public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
	global $wgContLang;
	$fname = __METHOD__;

	// Without terms the pattern below would match at every position of
	// every line; there is nothing to highlight.
	if ( !$terms ) {
		return '';
	}

	$lines = explode( "\n", $text );

	$terms = implode( '|', $terms );
	$max = intval( $contextchars ) + 1;
	$pat1 = "/(.*)($terms)(.{0,$max})/i";
	$pat2 = '/(' . $terms . ")/i";

	$extract = "";
	wfProfileIn( "$fname-extract" );
	foreach ( $lines as $line ) {
		if ( 0 == $contextlines ) {
			break;
		}
		$m = array();
		if ( ! preg_match( $pat1, $line, $m ) ) {
			continue;
		}
		--$contextlines;
		$pre = $wgContLang->truncate( $m[1], -$contextchars );
		$post = isset( $m[3] ) ? $wgContLang->truncate( $m[3], $contextchars ) : '';
		$found = $m[2];

		$line = htmlspecialchars( $pre . $found . $post );
		$line = preg_replace( $pat2,
			"<span class='searchmatch'>\\1</span>", $line );

		$extract .= "{$line}\n";
	}
	wfProfileOut( "$fname-extract" );

	return $extract;
}
1245
1246 }
1247
1248 /**
1249 * Dummy class to be used when non-supported Database engine is present.
1250 * @todo Fixme: dummy class should probably try something at least mildly useful,
1251 * such as a LIKE search through titles.
1252 * @ingroup Search
1253 */
1254 class SearchEngineDummy extends SearchEngine {
1255 // no-op
1256 }