/**
 * Highlight bits of wikitext
 *
 * Splits page text into plain-text lines and "other" lines (templates,
 * image links, tables), picks the lines that match the search terms, pads
 * them to a roughly constant size and returns an HTML-escaped extract with
 * the matched terms wrapped in <span class='searchmatch'>.
 *
 * @addtogroup Search
 */
class SearchHighlighter {
	/** @var bool When true, extracts are passed through removeWiki() before use */
	public $mCleanWikitext = true;

	/**
	 * @param bool $cleanupWikitext Whether to strip basic wiki markup from extracts
	 */
	public function __construct( $cleanupWikitext = true ) {
		$this->mCleanWikitext = $cleanupWikitext;
	}

	/**
	 * Default implementation of wikitext highlighting
	 *
	 * @param string $text
	 * @param array $terms Terms to highlight (unescaped)
	 * @param int $contextlines
	 * @param int $contextchars
	 * @return string HTML: escaped extract with highlighted terms, '' for empty text
	 */
	public function highlightText( $text, $terms, $contextlines, $contextchars ) {
		global $wgContLang;
		$fname = __METHOD__;

		if ( $text == '' ) {
			return '';
		}

		// split text into text + templates/links/tables
		$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)/";
		// first capture group is for detecting nested templates/links/tables
		$endPatterns = array(
			1 => '/(\{\{)|(\}\})/', // template
			2 => '/(\[\[)|(\]\])/', // image
			3 => "/(\n\\{\\|)|(\n\\|\\})/" ); // table
		$textExt = array(); // text extracts
		$otherExt = array(); // other extracts
		wfProfileIn( "$fname-split" );
		$start = 0;
		$textLen = strlen( $text );
		$count = 0; // sequence number to maintain ordering
		while ( $start < $textLen ) {
			// find start of template/image/table
			if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
				$epat = '';
				foreach ( $matches as $key => $val ) {
					if ( $key > 0 && $val[1] != -1 ) {
						if ( $key == 2 ) {
							// see if this is an image link
							$ns = substr( $val[0], 2, -1 );
							if ( $wgContLang->getNsIndex( $ns ) != NS_IMAGE ) {
								// NOTE(review): a non-image namespaced link leaves $epat
								// empty, so the whole remainder of the text is added as
								// plain text below and splitting stops here.
								break;
							}
						}
						$epat = $endPatterns[$key];
						// everything before the opening marker is plain text
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
						$start = $val[1];
						break;
					}
				}
				if ( $epat ) {
					// find end (and detect any nested elements)
					$level = 0;
					$offset = $start + 1;
					$found = false;
					while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
						// group 2 is only present in the result when a closing marker matched
						if ( array_key_exists( 2, $endMatches ) ) {
							if ( $level == 0 ) {
								// found the matching end
								$len = strlen( $endMatches[2][0] );
								$off = $endMatches[2][1];
								$this->splitAndAdd( $otherExt, $count,
									substr( $text, $start, $off + $len - $start ) );
								$start = $off + $len;
								$found = true;
								break;
							}
							// end of nested element
							$level -= 1;
						} else {
							// start of nested element
							$level += 1;
						}
						$offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
					}
					if ( !$found ) {
						// couldn't find appropriate closing tag: treat the opening
						// marker itself as text and carry on after it
						$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
						$start += strlen( $matches[0][0] );
					}
					continue;
				}
			}
			// else: add the rest as text extract
			$this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
			break;
		}

		$all = $textExt + $otherExt; // these have disjunct key sets

		wfProfileOut( "$fname-split" );

		// prepare regexps; empty terms are dropped because an empty alternative
		// would match at every position
		$patterns = array();
		foreach ( $terms as $term ) {
			$term = (string)$term;
			if ( $term === '' ) {
				continue;
			}
			$quoted = preg_quote( $term, '/' );
			// manually do upper/lowercase stuff for utf-8 since PHP won't do it
			if ( preg_match( '/[\x80-\xff]/', $term ) ) {
				$quoted = preg_replace_callback( '/./us', array( $this, 'caseCallback' ), $quoted );
				if ( $quoted === null ) {
					// term is not valid UTF-8 and cannot be used in a /u pattern
					continue;
				}
			}
			$patterns[] = $quoted;
		}
		$anyterm = implode( '|', $patterns );
		$phrase = implode( '[, .:;\(\)"\'\-\+]+', $patterns );
		$hasTerms = ( $anyterm !== '' );

		// FIXME: a hack to scale contextchars, a correct solution
		// would be to have contextchars actually be char and not byte
		// length, and do proper utf-8 substrings and lengths everywhere,
		// but PHP is making that very hard and unclean to implement :(
		$anytermChars = mb_strlen( $anyterm );
		if ( $anytermChars > 0 ) {
			$scale = strlen( $anyterm ) / $anytermChars;
			$contextchars = intval( $contextchars * $scale );
		}

		$pat1 = '/(' . $phrase . ')/ui';
		$pat2 = '/(' . $anyterm . ')/ui';

		wfProfileIn( "$fname-extract" );

		$left = $contextlines;

		$snippets = array();
		$offsets = array();
		if ( $hasTerms ) {
			// match whole query on text
			$this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
			// match whole query on templates/tables/images
			$this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
			// match any words on text
			$this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
			// match any words on templates/tables/images
			$this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );
		}

		ksort( $snippets );

		// index of the first plain-text line (0 when there is none)
		$first = array_keys( $textExt );
		$first = isset( $first[0] ) ? $first[0] : 0;

		// add extra chars to each snippet to make snippets constant size
		$extended = array();
		if ( count( $snippets ) == 0 ) {
			// couldn't find the target words, just show beginning of article
			$targetchars = $contextchars * $contextlines;
			$snippets[$first] = '';
			$offsets[$first] = 0;
		} else {
			// if begin of the article contains the whole phrase, show only that !!
			if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
				&& $offsets[$first] < $contextchars * 2 ) {
				$snippets = array( $first => $snippets[$first] );
			}

			// calc by how much to extend existing snippets
			$targetchars = intval( ( $contextchars * $contextlines ) / count( $snippets ) );
		}

		foreach ( $snippets as $index => $line ) {
			$extended[$index] = $line;
			$len = strlen( $line );
			if ( $len < $targetchars - 20 ) {
				// complete this line ($all[$index] is missing when there are no extracts at all)
				if ( isset( $all[$index] ) && $len < strlen( $all[$index] ) ) {
					$extended[$index] = $this->extract( $all[$index], $offsets[$index],
						$offsets[$index] + $targetchars, $offsets[$index] );
					$len = strlen( $extended[$index] );
				}

				// add more lines, stopping at lines that already have their own snippet
				$add = $index + 1;
				while ( $len < $targetchars - 20
					&& array_key_exists( $add, $all )
					&& !array_key_exists( $add, $snippets ) ) {
					$offsets[$add] = 0;
					$tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
					$extended[$add] = $tt;
					$len += strlen( $tt );
					$add++;
				}
			}
		}

		// Escape and highlight every line on its own, before the separators are
		// added, so that terms can neither match inside HTML entities nor inside
		// the '<b> ... </b>' markup produced here.
		$last = -1;
		$extract = '';
		foreach ( $extended as $index => $line ) {
			$html = $this->highlightAndEscape( $line, $hasTerms ? $pat2 : '' );
			if ( $last == -1 ) {
				$extract .= $html; // first line
			} elseif ( $last + 1 == $index
				&& $offsets[$last] + strlen( $extended[$last] ) >= strlen( $all[$last] ) ) {
				$extract .= " " . $html; // continuous lines
			} else {
				$extract .= '<b> ... </b>' . $html;
			}

			$last = $index;
		}
		if ( $extract !== '' ) {
			$extract .= '<b> ... </b>';
		}

		wfProfileOut( "$fname-extract" );

		return $extract;
	}

	/**
	 * HTML-escape one raw line and wrap every match of $pattern in a
	 * searchmatch span.
	 *
	 * @param string $line Raw (unescaped) text
	 * @param string $pattern Regexp with exactly one capturing group around the
	 *   whole match, or '' to only escape
	 * @return string HTML
	 */
	private function highlightAndEscape( $line, $pattern ) {
		if ( $pattern === '' ) {
			return htmlspecialchars( $line );
		}
		$parts = preg_split( $pattern, $line, -1, PREG_SPLIT_DELIM_CAPTURE );
		if ( $parts === false ) {
			// e.g. the line is not valid UTF-8 for a /u pattern
			return htmlspecialchars( $line );
		}
		$html = '';
		foreach ( $parts as $i => $part ) {
			$escaped = htmlspecialchars( $part );
			// with a single capturing group, odd entries are the matched terms
			if ( $i % 2 == 1 ) {
				$html .= "<span class='searchmatch'>" . $escaped . "</span>";
			} else {
				$html .= $escaped;
			}
		}
		return $html;
	}

	/**
	 * Split text into lines and add the non-empty (after trimming) ones to
	 * the extracts array
	 *
	 * @param array $extracts (in/out) index -> line
	 * @param int $count (in/out) next index to use; incremented per added line
	 * @param string $text
	 */
	function splitAndAdd( &$extracts, &$count, $text ) {
		$source = $this->mCleanWikitext ? $this->removeWiki( $text ) : $text;
		$split = explode( "\n", (string)$source );
		foreach ( $split as $line ) {
			$tt = trim( $line );
			// strict comparison so that a line consisting of "0" is kept
			if ( $tt !== '' ) {
				$extracts[$count++] = $tt;
			}
		}
	}

	/**
	 * Do manual case conversion for non-ascii chars
	 *
	 * @param array $matches preg match for a single character
	 * @return string A character class with the lower- and uppercase form for
	 *   multi-byte characters, the character itself otherwise
	 */
	function caseCallback( $matches ) {
		global $wgContLang;
		if ( strlen( $matches[0] ) > 1 ) {
			return '[' . $wgContLang->lc( $matches[0] ) . $wgContLang->uc( $matches[0] ) . ']';
		}
		return $matches[0];
	}

	/**
	 * Extract part of the text from start to end, but by
	 * not chopping up words
	 *
	 * @param string $text
	 * @param int $start
	 * @param int $end
	 * @param int $posStart (out) actual start position
	 * @param int $posEnd (out) actual end position
	 * @return string '' when the adjusted end is not after the adjusted start
	 */
	function extract( $text, $start, $end, &$posStart = null, &$posEnd = null ) {
		$textLen = strlen( $text );

		if ( $start != 0 ) {
			$start = $this->position( $text, $start, 1 );
		}
		if ( $end >= $textLen ) {
			$end = $textLen;
		} else {
			$end = $this->position( $text, $end );
		}

		// always report the positions actually used
		$posStart = $start;
		$posEnd = $end;

		if ( $end > $start ) {
			return substr( $text, $start, $end - $start );
		}
		return '';
	}

	/**
	 * Find a nonletter near a point (index) in the text
	 *
	 * Looks at up to 10 bytes on either side of $point and uses the first
	 * nonletter found in that window.
	 *
	 * @param string $text
	 * @param int $point
	 * @param int $offset added to the found index
	 * @return int index of the nonletter plus $offset, or $point moved forward
	 *   to the beginning of a utf8 char (at most strlen($text)) if none found
	 */
	function position( $text, $point, $offset = 0 ) {
		$tolerance = 10;
		$textLen = strlen( $text );
		$s = max( 0, $point - $tolerance );
		$l = min( $textLen, $point + $tolerance ) - $s;
		$m = array();
		if ( $l > 0
			&& preg_match( '/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr( $text, $s, $l ), $m, PREG_OFFSET_CAPTURE ) ) {
			return $m[0][1] + $s + $offset;
		}

		// keep the point inside the string before indexing into it
		if ( $point < 0 ) {
			$point = 0;
		}
		if ( $point >= $textLen ) {
			return $textLen;
		}

		// check if point is on a valid first UTF8 char
		$char = ord( $text[$point] );
		while ( $char >= 0x80 && $char < 0xc0 ) {
			// skip trailing bytes
			$point++;
			if ( $point >= $textLen ) {
				return $textLen;
			}
			$char = ord( $text[$point] );
		}
		return $point;
	}

	/**
	 * Search extracts for a pattern, and return snippets
	 *
	 * @param string $pattern regexp for matching lines
	 * @param array $extracts extracts to search
	 * @param int $linesleft (in/out) number of extracts still to make
	 * @param int $contextchars length of snippet
	 * @param array $out (in/out) map for highlighted snippets
	 * @param array $offsets (in/out) map of starting points of snippets
	 * @protected
	 */
	function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ) {
		if ( $linesleft == 0 ) {
			return; // nothing to do
		}
		foreach ( $extracts as $index => $line ) {
			if ( array_key_exists( $index, $out ) ) {
				continue; // this line already highlighted
			}

			$m = array();
			if ( !preg_match( $pattern, $line, $m, PREG_OFFSET_CAPTURE ) ) {
				continue;
			}

			$offset = $m[0][1];
			$len = strlen( $m[0][0] );
			if ( $offset + $len < $contextchars ) {
				// match fits into a snippet taken from the start of the line
				$begin = 0;
			} elseif ( $len > $contextchars ) {
				// match is longer than a snippet: start at the match
				$begin = $offset;
			} else {
				// center the match within the snippet
				$begin = $offset + intval( ( $len - $contextchars ) / 2 );
			}

			$end = $begin + $contextchars;

			$posBegin = $begin;
			// basic snippet from this line
			$out[$index] = $this->extract( $line, $begin, $end, $posBegin );
			$offsets[$index] = $posBegin;
			$linesleft--;
			if ( $linesleft == 0 ) {
				return;
			}
		}
	}

	/**
	 * Basic wikitext removal
	 *
	 * Drops parameterless templates, reduces piped templates and links to
	 * their trailing part, strips HTML-like tags and quote-based emphasis.
	 *
	 * @param string $text
	 * @return string
	 * @protected
	 */
	function removeWiki( $text ) {
		$fname = __METHOD__;
		wfProfileIn( $fname );

		$text = preg_replace( "/\\{\\{([^|]+?)\\}\\}/", "", $text );
		$text = preg_replace( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2", $text );
		$text = preg_replace( "/\\[\\[([^|]+?)\\]\\]/", "\\1", $text );
		$text = preg_replace_callback( "/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array( $this, 'linkReplace' ), $text );
		$text = preg_replace( "/<\/?[^>]+>/", "", $text );
		$text = preg_replace( "/'''''/", "", $text );
		$text = preg_replace( "/('''|<\/?[iIuUbB]>)/", "", $text );
		$text = preg_replace( "/''/", "", $text );

		wfProfileOut( $fname );
		return $text;
	}

	/**
	 * callback to replace [[target|caption]] kind of links, if
	 * the target is category or image, leave it
	 *
	 * @param array $matches 0: whole link, 1: "target|", 2: caption
	 * @return string
	 */
	function linkReplace( $matches ) {
		$colon = strpos( $matches[1], ':' );
		if ( $colon === false ) {
			return $matches[2]; // replace with caption
		}
		global $wgContLang;
		$ns = substr( $matches[1], 0, $colon );
		$index = $wgContLang->getNsIndex( $ns );
		if ( $index !== false && ( $index == NS_IMAGE || $index == NS_CATEGORY ) ) {
			return $matches[0]; // return the whole thing
		}
		return $matches[2];
	}
}