From 5fe63300af20c9e8b169ca105456ee0acf190c37 Mon Sep 17 00:00:00 2001
From: Brion Vibber <brion@users.mediawiki.org>
Date: Sun, 1 Aug 2004 20:43:54 +0000
Subject: [PATCH] Add Language::truncate() method for truncating strings to a
 certain byte length. UTF-8 version ensures that only whole characters are
 included.

Fix for: [ 1001502 ] Search preview splits up multi-byte UTF-8 characters
Todo: [ 855680 ] Broken UTF-8 cutoff breaks display in some browsers
---
 includes/SearchEngine.php  | 35 ++++++++++++----------------
 languages/Language.php     | 47 ++++++++++++++++++++++++++++----------
 languages/LanguageUtf8.php | 38 ++++++++++++++++++++++++++++++
 3 files changed, 87 insertions(+), 33 deletions(-)
diff --git a/includes/SearchEngine.php b/includes/SearchEngine.php
index 8c110460ff..252725614e 100644
--- a/includes/SearchEngine.php
+++ b/includes/SearchEngine.php
@@ -372,7 +372,7 @@ class SearchEngine {
 
 	function showHit( $row )
 	{
-		global $wgUser, $wgOut;
+		global $wgUser, $wgOut, $wgLang;
 
 		$t = Title::makeName( $row->cur_namespace, $row->cur_title );
 		$sk = $wgUser->getSkin();
@@ -391,33 +391,26 @@ class SearchEngine {
 		$lineno = 0;
 
 		foreach ( $lines as $line ) {
-			if ( 0 == $contextlines ) { break; }
+			if ( 0 == $contextlines ) {
+				break;
+			}
 			--$contextlines;
 			++$lineno;
-			if ( ! preg_match( $pat1, $line, $m ) ) { continue; }
-
-			$pre = $m[1];
-			if ( 0 == $contextchars ) { $pre = "..."; }
-			else {
-				if ( strlen( $pre ) > $contextchars ) {
-					$pre = "..." . substr( $pre, -$contextchars );
-				}
+			if ( ! preg_match( $pat1, $line, $m ) ) {
+				continue;
 			}
-			$pre = wfEscapeHTML( $pre );
 
-			if ( count( $m ) < 3 ) { $post = ""; }
-			else { $post = $m[3]; }
+			$pre = $wgLang->truncate( $m[1], -$contextchars, "..." );
 
-			if ( 0 == $contextchars ) { $post = "..."; }
-			else {
-				if ( strlen( $post ) > $contextchars ) {
-					$post = substr( $post, 0, $contextchars ) . "...";
-				}
+			if ( count( $m ) < 3 ) {
+				$post = "";
+			} else {
+				$post = $wgLang->truncate( $m[3], $contextchars, "..." );
 			}
-			$post = wfEscapeHTML( $post );
-			$found = wfEscapeHTML( $m[2] );
 
-			$line = "{$pre}{$found}{$post}";
+			$found = $m[2];
+
+			$line = htmlspecialchars( $pre . $found . $post );
 			$pat2 = "/(" . implode( "|", $this->mSearchterms ) . ")/i";
 			$line = preg_replace( $pat2,
 			  "<font color='red'>\\1</font>", $line );
diff --git a/languages/Language.php b/languages/Language.php
index 2fa2158f42..e5ee0d614e 100644
--- a/languages/Language.php
+++ b/languages/Language.php
@@ -1841,19 +1841,42 @@ class Language {
 		return $number;
 	}
 
-        function listToText( $l ) {
-	        $s = '';
-	        $m = count($l) - 1;
-	        for ($i = $m; $i >= 0; $i--) {
-		    if ($i == $m) {
-			$s = $l[$i];
-		    } else if ($i == $m - 1) {
-			$s = $l[$i] . ' ' . $this->getMessage('and') . ' ' . $s;
-		    } else {
-			$s = $l[$i] . ', ' . $s;
-		    }
+	function listToText( $l ) { # joins $l as "a, b <and> c"; <and> is the 'and' message
+		$s = '';
+		$m = count($l) - 1; # index of the last element (-1 for an empty list)
+		for ($i = $m; $i >= 0; $i--) { # walk backwards, prepending to $s
+			if ($i == $m) {
+				$s = $l[$i]; # last item starts the result
+			} else if ($i == $m - 1) {
+				$s = $l[$i] . ' ' . $this->getMessage('and') . ' ' . $s; # second-to-last: joined with 'and'
+			} else {
+				$s = $l[$i] . ', ' . $s; # earlier items: comma-separated
+			}
+		}
+		return $s; # '' when $l is empty, since the loop never runs
+	}
+	
+	# Crop a string from the beginning or end to a certain number of bytes.
+	# (Bytes are used because our storage has limited byte lengths for some
+	# columns in the database.) This base version cuts at raw byte offsets;
+	# multibyte charsets will need to make sure that only whole characters
+	# are included!
+	# $length does not include the optional ellipsis; 0 returns just $ellipsis.
+	# If $length is negative, snip from the beginning
+	function truncate( $string, $length, $ellipsis = "" ) {
+		if( $length == 0 ) {
+			return $ellipsis;
+		}
+		if ( strlen( $string ) <= abs( $length ) ) {
+			return $string;
+		}
+		if( $length > 0 ) {
+			$string = substr( $string, 0, $length );
+			return $string . $ellipsis;
+		} else {
+			$string = substr( $string, $length );
+			return $ellipsis . $string;
 		}
-	        return $s;
 	}
 }
 
diff --git a/languages/LanguageUtf8.php b/languages/LanguageUtf8.php
index c85530535d..22c7832f74 100644
--- a/languages/LanguageUtf8.php
+++ b/languages/LanguageUtf8.php
@@ -71,6 +71,44 @@ class LanguageUtf8 extends Language {
 		
 		return isset( $matches[1] ) ? $matches[1] : "";
 	}
+
+	# Crop a string from the beginning or end to a certain number of bytes,
+	# never leaving a partial character at the cut. (Bytes are used
+	# because our storage has limited byte lengths for some columns in the
+	# database.) A character straddling the cut is dropped whole, so the
+	# result may be shorter than abs($length) bytes.
+	# $length does not include the optional ellipsis; 0 returns just $ellipsis.
+	# If $length is negative, snip from the beginning
+	function truncate( $string, $length, $ellipsis = "" ) {
+		if( $length == 0 ) {
+			return $ellipsis;
+		}
+		if ( strlen( $string ) <= abs( $length ) ) {
+			return $string;
+		}
+		if( $length > 0 ) {
+			$string = substr( $string, 0, $length );
+			$char = ord( $string[strlen( $string ) - 1] );
+			if ($char >= 0xc0) {
+				# We got the first byte only of a multibyte char; remove it.
+				$string = substr( $string, 0, -1 );
+			} elseif( $char >= 0x80 &&
+			          preg_match( '/^(.*)(?:[\xe0-\xef][\x80-\xbf]|' .
+			                      '[\xf0-\xf7][\x80-\xbf]{1,2})$/s', $string, $m ) ) {
+				# Chopped mid-character; drop it. /s lets (.*) span newlines.
+				$string = $m[1];
+			}
+			return $string . $ellipsis;
+		} else {
+			$string = substr( $string, $length );
+			$char = ord( $string[0] );
+			if( $char >= 0x80 && $char < 0xc0 ) {
+				# We chopped in the middle of a character; remove the whole thing
+				$string = preg_replace( '/^[\x80-\xbf]+/', '', $string );
+			}
+			return $ellipsis . $string;
+		}
+	}
 }
 
 } # ifdef MEDIAWIKI
-- 
2.20.1