From e834c0b9b276d33a219f5f91462105fc560997b4 Mon Sep 17 00:00:00 2001 From: Aaron Schulz Date: Wed, 24 Feb 2010 04:14:45 +0000 Subject: [PATCH] * Moved truncateHtml() to language.php * Renamed $maxLen -> $length * Made $length=0 case match truncate() --- languages/Language.php | 206 +++++++++++++++++++++++++++++++++++------ 1 file changed, 180 insertions(+), 26 deletions(-) diff --git a/languages/Language.php b/languages/Language.php index 1afc8a1789..00b0dad280 100644 --- a/languages/Language.php +++ b/languages/Language.php @@ -2155,42 +2155,26 @@ class Language { */ function truncate( $string, $length, $ellipsis = '...' ) { # Use the localized ellipsis character - if( $ellipsis == '...' ) { + if ( $ellipsis == '...' ) { $ellipsis = wfMsgExt( 'ellipsis', array( 'escapenoentities', 'language' => $this ) ); } - - if( $length == 0 ) { + # Check if there is no need to truncate + if ( $length == 0 ) { return $ellipsis; - } - if ( strlen( $string ) <= abs( $length ) ) { + } elseif ( strlen( $string ) <= abs( $length ) ) { return $string; } $stringOriginal = $string; - if( $length > 0 ) { - $string = substr( $string, 0, $length ); - $char = ord( $string[strlen( $string ) - 1] ); - $m = array(); - if ($char >= 0xc0) { - # We got the first byte only of a multibyte char; remove it. - $string = substr( $string, 0, -1 ); - } elseif( $char >= 0x80 && - preg_match( '/^(.*)(?:[\xe0-\xef][\x80-\xbf]|' . - '[\xf0-\xf7][\x80-\xbf]{1,2})$/', $string, $m ) ) { - # We chopped in the middle of a character; remove it - $string = $m[1]; - } + if ( $length > 0 ) { + $string = substr( $string, 0, $length ); // xyz... + $string = self::removeBadCharLast( $string ); $string = $string . $ellipsis; - } else { - $string = substr( $string, $length ); - $char = ord( $string[0] ); - if( $char >= 0x80 && $char < 0xc0 ) { - # We chopped in the middle of a character; remove the whole thing - $string = preg_replace( '/^[\x80-\xbf]+/', '', $string ); - } + $string = substr( $string, $length ); // ...xyz + $string = self::removeBadCharFirst( $string ); $string = $ellipsis . $string; } - # Do not truncate if the ellipsis actually make the string longer. Bug 22181 + # Do not truncate if the ellipsis makes the string longer (bug 22181) if ( strlen( $string ) < strlen( $stringOriginal ) ) { return $string; } else { @@ -2198,6 +2182,176 @@ class Language { } } + /** + * Remove bytes that represent an incomplete Unicode character + * at the end of string (e.g. bytes of the char are missing) + * + * @param $string String + * @return string + */ + protected function removeBadCharLast( $string ) { + $char = ord( $string[strlen( $string ) - 1] ); + $m = array(); + if ( $char >= 0xc0 ) { + # We got the first byte only of a multibyte char; remove it. + $string = substr( $string, 0, -1 ); + } elseif ( $char >= 0x80 && + preg_match( '/^(.*)(?:[\xe0-\xef][\x80-\xbf]|' . + '[\xf0-\xf7][\x80-\xbf]{1,2})$/', $string, $m ) ) + { + # We chopped in the middle of a character; remove it + $string = $m[1]; + } + return $string; + } + + /** + * Remove bytes that represent an incomplete Unicode character + * at the start of string (e.g. bytes of the char are missing) + * + * @param $string String + * @return string + */ + protected function removeBadCharFirst( $string ) { + $char = ord( $string[0] ); + if ( $char >= 0x80 && $char < 0xc0 ) { + # We chopped in the middle of a character; remove the whole thing + $string = preg_replace( '/^[\x80-\xbf]+/', '', $string ); + } + return $string; + } + + /* + * Truncate a string of valid HTML to a specified length in bytes, + * appending an optional string (e.g. for ellipses), and return valid HTML + * + * This is only intended for styled/linked text, such as HTML with + * tags like and , were the tags are self-contained (valid HTML) + * + * Note: tries to fix broken HTML with MWTidy + * + * @param string $text String to truncate + * @param int $length (zero/positive) Maximum length (excluding ellipses) + * @param string $ellipsis String to append to the truncated text + * @returns string + */ + function truncateHtml( $text, $length, $ellipsis = '...' ) { + # Use the localized ellipsis character + if ( $ellipsis == '...' ) { + $ellipsis = wfMsgExt( 'ellipsis', array( 'escapenoentities', 'language' => $this ) ); + } + # Check if there is no need to truncate + if ( $length <= 0 ) { + return $ellipsis; // no text shown, nothing to format + } elseif ( strlen($text) <= $length ) { + return $text; // string short enough even *with* HTML + } + $text = MWTidy::tidy( $text ); // fix tags + $displayLen = 0; // innerHTML legth so far + $doTruncate = true; // truncated string plus '...' shorter than original? + $tagType = 0; // 0-open, 1-close + $bracketState = 0; // 1-tag start, 2-tag name, 0-neither + $entityState = 0; // 0-not entity, 1-entity + $tag = $ret = $ch = ''; + $openTags = array(); + $textLen = strlen($text); + for( $pos = 0; $pos < $textLen; ++$pos ) { + $ch = $text[$pos]; + $lastCh = $pos ? $text[$pos-1] : ''; + $ret .= $ch; // add to result string + if ( $ch == '<' ) { + self::onEndBracket( $tag, $tagType, $lastCh, $openTags ); // for bad HTML + $entityState = 0; // for bad HTML + $bracketState = 1; // tag started (checking for backslash) + } elseif ( $ch == '>' ) { + self::onEndBracket( $tag, $tagType, $lastCh, $openTags ); + $entityState = 0; // for bad HTML + $bracketState = 0; // out of brackets + } elseif ( $bracketState == 1 ) { + if ( $ch == '/' ) { + $tagType = 1; // close tag (e.g. "") + } else { + $tagType = 0; // open tag (e.g. "") + $tag .= $ch; + } + $bracketState = 2; // building tag name + } elseif ( $bracketState == 2 ) { + if ( $ch != ' ' ) { + $tag .= $ch; + } else { + // Name found (e.g. "", $pos + 1 ); + } + } elseif ( $bracketState == 0 ) { + if ( $entityState ) { + if ( $ch == ';' ) { + $entityState = 0; + $displayLen++; // entity is one displayed char + } + } else { + if ( $ch == '&' ) { + $entityState = 1; // entity found, (e.g. " ") + } else { + $displayLen++; // this char is displayed + // Add on the other display text after this... + $skipped = self::skipAndAppend( + $ret, $text, "<>&", $pos + 1, $length - $displayLen ); + $displayLen += $skipped; + $pos += $skipped; + } + } + } + if( !$doTruncate ) continue; + # Truncate if not in the middle of a bracket/entity... + if ( $displayLen >= $length && $bracketState == 0 && $entityState == 0 ) { + $remaining = substr( $text, $pos + 1 ); // remaining string + $remaining = StringUtils::delimiterReplace( '<', '>', '', $remaining ); // rm tags + $remaining = StringUtils::delimiterReplace( '&', ';', '', $remaining ); // rm entities + $doTruncate = ( strlen($remaining) > strlen($ellipsis) ); + if ( $doTruncate ) { + $ret = self::removeBadCharLast( $ret ) . $ellipsis; + break; + } + } + } + if ( $displayLen == 0 ) { + return ''; // no text shown, nothing to format + } + self::onEndBracket( $tag, $text[$textLen-1], $tagType, $openTags ); // for bad HTML + while ( count( $openTags ) > 0 ) { + $ret .= ''; // close open tags + } + return $ret; + } + + // truncateHtml() helper function + // like strcspn() but adds the skipped chars to $ret + private function skipAndAppend( &$ret, $text, $search, $start, $len = -1 ) { + $skipCount = 0; + if( $start < strlen($text) ) { + $skipCount = strcspn( $text, $search, $start, $len ); + $ret .= substr( $text, $start, $skipCount ); + } + return $skipCount; + } + + // truncateHtml() helper function + // (a) push or pop $tag from $openTags as needed + // (b) clear $tag value + private function onEndBracket( &$tag, $tagType, $lastCh, &$openTags ) { + $tag = ltrim( $tag ); + if( $tag != '' ) { + if( $tagType == 0 && $lastCh != '/' ) { + $openTags[] = $tag; // tag opened (didn't close itself) + } else if( $tagType == 1 ) { + if( $openTags && $tag == $openTags[count($openTags)-1] ) { + array_pop( $openTags ); // tag closed + } + } + $tag = ''; + } + } + /** * Grammatical transformations, needed for inflected languages * Invoked by putting {{grammar:case|word}} in a message -- 2.20.1