From 5fe63300af20c9e8b169ca105456ee0acf190c37 Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Sun, 1 Aug 2004 20:43:54 +0000 Subject: [PATCH] Add Language::truncate() method for truncating strings to a certain byte length. UTF-8 version ensures that only whole characters are included. Fix for: [ 1001502 ] Search preview splits up multi-byte UTF-8 characters Todo: [ 855680 ] Broken UTF-8 cutoff breaks display in some browsers --- includes/SearchEngine.php | 35 ++++++++++++---------------- languages/Language.php | 47 ++++++++++++++++++++++++++++---------- languages/LanguageUtf8.php | 38 ++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+), 33 deletions(-) diff --git a/includes/SearchEngine.php b/includes/SearchEngine.php index 8c110460ff..252725614e 100644 --- a/includes/SearchEngine.php +++ b/includes/SearchEngine.php @@ -372,7 +372,7 @@ class SearchEngine { function showHit( $row ) { - global $wgUser, $wgOut; + global $wgUser, $wgOut, $wgLang; $t = Title::makeName( $row->cur_namespace, $row->cur_title ); $sk = $wgUser->getSkin(); @@ -391,33 +391,26 @@ class SearchEngine { $lineno = 0; foreach ( $lines as $line ) { - if ( 0 == $contextlines ) { break; } + if ( 0 == $contextlines ) { + break; + } --$contextlines; ++$lineno; - if ( ! preg_match( $pat1, $line, $m ) ) { continue; } - - $pre = $m[1]; - if ( 0 == $contextchars ) { $pre = "..."; } - else { - if ( strlen( $pre ) > $contextchars ) { - $pre = "..." . substr( $pre, -$contextchars ); - } + if ( ! preg_match( $pat1, $line, $m ) ) { + continue; } - $pre = wfEscapeHTML( $pre ); - if ( count( $m ) < 3 ) { $post = ""; } - else { $post = $m[3]; } + $pre = $wgLang->truncate( $m[1], -$contextchars, "..." ); - if ( 0 == $contextchars ) { $post = "..."; } - else { - if ( strlen( $post ) > $contextchars ) { - $post = substr( $post, 0, $contextchars ) . "..."; - } + if ( count( $m ) < 3 ) { + $post = ""; + } else { + $post = $wgLang->truncate( $m[3], $contextchars, "..." ); } - $post = wfEscapeHTML( $post ); - $found = wfEscapeHTML( $m[2] ); - $line = "{$pre}{$found}{$post}"; + $found = $m[2]; + + $line = htmlspecialchars( $pre . $found . $post ); $pat2 = "/(" . implode( "|", $this->mSearchterms ) . ")/i"; $line = preg_replace( $pat2, "\\1", $line ); diff --git a/languages/Language.php b/languages/Language.php index 2fa2158f42..e5ee0d614e 100644 --- a/languages/Language.php +++ b/languages/Language.php @@ -1841,19 +1841,42 @@ class Language { return $number; } - function listToText( $l ) { - $s = ''; - $m = count($l) - 1; - for ($i = $m; $i >= 0; $i--) { - if ($i == $m) { - $s = $l[$i]; - } else if ($i == $m - 1) { - $s = $l[$i] . ' ' . $this->getMessage('and') . ' ' . $s; - } else { - $s = $l[$i] . ', ' . $s; - } + function listToText( $l ) { + $s = ''; + $m = count($l) - 1; + for ($i = $m; $i >= 0; $i--) { + if ($i == $m) { + $s = $l[$i]; + } else if ($i == $m - 1) { + $s = $l[$i] . ' ' . $this->getMessage('and') . ' ' . $s; + } else { + $s = $l[$i] . ', ' . $s; + } + } + return $s; + } + + # Crop a string from the beginning or end to a certain number of bytes. + # (Bytes are used because our storage has limited byte lengths for some + # columns in the database.) Multibyte charsets will need to make sure that + # only whole characters are included! + # + # $length does not include the optional ellipsis. + # If $length is negative, snip from the beginning + function truncate( $string, $length, $ellipsis = "" ) { + if( $length == 0 ) { + return $ellipsis; + } + if ( strlen( $string ) <= abs( $length ) ) { + return $string; + } + if( $length > 0 ) { + $string = substr( $string, 0, $length ); + return $string . $ellipsis; + } else { + $string = substr( $string, $length ); + return $ellipsis . $string; } - return $s; } } diff --git a/languages/LanguageUtf8.php b/languages/LanguageUtf8.php index c85530535d..22c7832f74 100644 --- a/languages/LanguageUtf8.php +++ b/languages/LanguageUtf8.php @@ -71,6 +71,44 @@ class LanguageUtf8 extends Language { return isset( $matches[1] ) ? $matches[1] : ""; } + + # Crop a string from the beginning or end to a certain number of bytes. + # (Bytes are used because our storage has limited byte lengths for some + # columns in the database.) Multibyte charsets will need to make sure that + # only whole characters are included! + # + # $length does not include the optional ellipsis. + # If $length is negative, snip from the beginning + function truncate( $string, $length, $ellipsis = "" ) { + if( $length == 0 ) { + return $ellipsis; + } + if ( strlen( $string ) <= abs( $length ) ) { + return $string; + } + if( $length > 0 ) { + $string = substr( $string, 0, $length ); + $char = ord( $string[strlen( $string ) - 1] ); + if ($char >= 0xc0) { + # We got the first byte only of a multibyte char; remove it. + $string = substr( $string, 0, -1 ); + } elseif( $char >= 0x80 && + preg_match( '/^(.*)(?:[\xe0-\xef][\x80-\xbf]|' . + '[\xf0-\xf7][\x80-\xbf]{1,2})$/', $string, $m ) ) { + # We chopped in the middle of a character; remove it + $string = $m[1]; + } + return $string . $ellipsis; + } else { + $string = substr( $string, $length ); + $char = ord( $string[0] ); + if( $char >= 0x80 && $char < 0xc0 ) { + # We chopped in the middle of a character; remove the whole thing + $string = preg_replace( '/^[\x80-\xbf]+/', '', $string ); + } + return $ellipsis . $string; + } + } } } # ifdef MEDIAWIKI -- 2.20.1