* Moved truncateHtml() to language.php
authorAaron Schulz <aaron@users.mediawiki.org>
Wed, 24 Feb 2010 04:14:45 +0000 (04:14 +0000)
committerAaron Schulz <aaron@users.mediawiki.org>
Wed, 24 Feb 2010 04:14:45 +0000 (04:14 +0000)
* Renamed $maxLen -> $length
* Made $length=0 case match truncate()

languages/Language.php

index 1afc8a1..00b0dad 100644 (file)
@@ -2155,42 +2155,26 @@ class Language {
         */
        function truncate( $string, $length, $ellipsis = '...' ) {
                # Use the localized ellipsis character
-               if( $ellipsis == '...' ) {
+               if ( $ellipsis == '...' ) {
                        $ellipsis = wfMsgExt( 'ellipsis', array( 'escapenoentities', 'language' => $this ) );
                }
-
-               if( $length == 0 ) {
+               # Check if there is no need to truncate
+               if ( $length == 0 ) {
                        return $ellipsis;
-               }
-               if ( strlen( $string ) <= abs( $length ) ) {
+               } elseif ( strlen( $string ) <= abs( $length ) ) {
                        return $string;
                }
                $stringOriginal = $string;
-               if( $length > 0 ) {
-                       $string = substr( $string, 0, $length );
-                       $char = ord( $string[strlen( $string ) - 1] );
-                       $m = array();
-                       if ($char >= 0xc0) {
-                               # We got the first byte only of a multibyte char; remove it.
-                               $string = substr( $string, 0, -1 );
-                       } elseif( $char >= 0x80 &&
-                                 preg_match( '/^(.*)(?:[\xe0-\xef][\x80-\xbf]|' .
-                                             '[\xf0-\xf7][\x80-\xbf]{1,2})$/', $string, $m ) ) {
-                               # We chopped in the middle of a character; remove it
-                               $string = $m[1];
-                       }
+               if ( $length > 0 ) {
+                       $string = substr( $string, 0, $length ); // xyz...
+                       $string = self::removeBadCharLast( $string );
                        $string = $string . $ellipsis;
-
                } else {
-                       $string = substr( $string, $length );
-                       $char = ord( $string[0] );
-                       if( $char >= 0x80 && $char < 0xc0 ) {
-                               # We chopped in the middle of a character; remove the whole thing
-                               $string = preg_replace( '/^[\x80-\xbf]+/', '', $string );
-                       }
+                       $string = substr( $string, $length ); // ...xyz
+                       $string = self::removeBadCharFirst( $string );
                        $string = $ellipsis . $string;
                }
-               # Do not truncate if the ellipsis actually make the string longer. Bug 22181
+               # Do not truncate if the ellipsis makes the string longer (bug 22181)
                if ( strlen( $string ) < strlen( $stringOriginal ) ) {
                        return $string;
                } else {
@@ -2198,6 +2182,176 @@ class Language {
                }
        }
 
+       /**
+        * Remove bytes that represent an incomplete Unicode character
+        * at the end of string (e.g. bytes of the char are missing)
+        *
+        * @param $string String
+        * @return string
+        */
+       protected function removeBadCharLast( $string ) {
+               $char = ord( $string[strlen( $string ) - 1] );
+               $m = array();
+               if ( $char >= 0xc0 ) {
+                       # We got the first byte only of a multibyte char; remove it.
+                       $string = substr( $string, 0, -1 );
+               } elseif ( $char >= 0x80 &&
+                     preg_match( '/^(.*)(?:[\xe0-\xef][\x80-\xbf]|' .
+                                 '[\xf0-\xf7][\x80-\xbf]{1,2})$/', $string, $m ) )
+               {
+                       # We chopped in the middle of a character; remove it
+                       $string = $m[1];
+               }
+               return $string;
+       }
+
+       /**
+        * Remove bytes that represent an incomplete Unicode character
+        * at the start of string (e.g. bytes of the char are missing)
+        *
+        * @param $string String
+        * @return string
+        */
+       protected function removeBadCharFirst( $string ) {
+               $char = ord( $string[0] );
+               if ( $char >= 0x80 && $char < 0xc0 ) {
+                       # We chopped in the middle of a character; remove the whole thing
+                       $string = preg_replace( '/^[\x80-\xbf]+/', '', $string );
+               }
+               return $string;
+       }
+
+       /*
+        * Truncate a string of valid HTML to a specified length in bytes,
+        * appending an optional string (e.g. for ellipses), and return valid HTML
+        *
+        * This is only intended for styled/linked text, such as HTML with
+        * tags like <span> and <a>, were the tags are self-contained (valid HTML)
+        *
+        * Note: tries to fix broken HTML with MWTidy
+        *
+        * @param string $text String to truncate
+        * @param int $length (zero/positive) Maximum length (excluding ellipses)
+        * @param string $ellipsis String to append to the truncated text
+        * @returns string
+        */
+       function truncateHtml( $text, $length, $ellipsis = '...' ) {
+               # Use the localized ellipsis character
+               if ( $ellipsis == '...' ) {
+                       $ellipsis = wfMsgExt( 'ellipsis', array( 'escapenoentities', 'language' => $this ) );
+               }
+               # Check if there is no need to truncate
+               if ( $length <= 0 ) {
+                       return $ellipsis; // no text shown, nothing to format
+               } elseif ( strlen($text) <= $length ) {
+                       return $text; // string short enough even *with* HTML
+               }
+               $text = MWTidy::tidy( $text ); // fix tags
+               $displayLen = 0; // innerHTML legth so far
+               $doTruncate = true; // truncated string plus '...' shorter than original?
+               $tagType = 0; // 0-open, 1-close
+               $bracketState = 0; // 1-tag start, 2-tag name, 0-neither
+               $entityState = 0; // 0-not entity, 1-entity
+               $tag = $ret = $ch = '';
+               $openTags = array();
+               $textLen = strlen($text);
+               for( $pos = 0; $pos < $textLen; ++$pos ) {
+                       $ch = $text[$pos];
+                       $lastCh = $pos ? $text[$pos-1] : '';
+                       $ret .= $ch; // add to result string
+                       if ( $ch == '<' ) {
+                               self::onEndBracket( $tag, $tagType, $lastCh, $openTags ); // for bad HTML
+                               $entityState = 0; // for bad HTML
+                               $bracketState = 1; // tag started (checking for backslash)
+                       } elseif ( $ch == '>' ) {
+                               self::onEndBracket( $tag, $tagType, $lastCh, $openTags );
+                               $entityState = 0; // for bad HTML
+                               $bracketState = 0; // out of brackets
+                       } elseif ( $bracketState == 1 ) {
+                               if ( $ch == '/' ) {
+                                       $tagType = 1; // close tag (e.g. "</span>")
+                               } else {
+                                       $tagType = 0; // open tag (e.g. "<span>")
+                                       $tag .= $ch;
+                               }
+                               $bracketState = 2; // building tag name
+                       } elseif ( $bracketState == 2 ) {
+                               if ( $ch != ' ' ) {
+                                       $tag .= $ch;
+                               } else {
+                                       // Name found (e.g. "<a href=..."), add on tag attributes...
+                                       $pos += self::skipAndAppend( $ret, $text, "<>", $pos + 1 );
+                               }
+                       } elseif ( $bracketState == 0 ) {
+                               if ( $entityState ) {
+                                       if ( $ch == ';' ) {
+                                               $entityState = 0;
+                                               $displayLen++; // entity is one displayed char
+                                       }
+                               } else {
+                                       if ( $ch == '&' ) {
+                                               $entityState = 1; // entity found, (e.g. "&nbsp;")
+                                       } else {
+                                               $displayLen++; // this char is displayed
+                                               // Add on the other display text after this...
+                                               $skipped = self::skipAndAppend(
+                                                       $ret, $text, "<>&", $pos + 1, $length - $displayLen );
+                                               $displayLen += $skipped;
+                                               $pos += $skipped;
+                                       }
+                               }
+                       }
+                       if( !$doTruncate ) continue;
+                       # Truncate if not in the middle of a bracket/entity...
+                       if ( $displayLen >= $length && $bracketState == 0 && $entityState == 0 ) {
+                               $remaining = substr( $text, $pos + 1 ); // remaining string
+                               $remaining = StringUtils::delimiterReplace( '<', '>', '', $remaining ); // rm tags
+                               $remaining = StringUtils::delimiterReplace( '&', ';', '', $remaining ); // rm entities
+                               $doTruncate = ( strlen($remaining) > strlen($ellipsis) );
+                               if ( $doTruncate ) {
+                                       $ret = self::removeBadCharLast( $ret ) . $ellipsis;
+                                       break;
+                               }
+                       }
+               }
+               if ( $displayLen == 0 ) {
+                       return ''; // no text shown, nothing to format
+               }
+               self::onEndBracket( $tag, $text[$textLen-1], $tagType, $openTags ); // for bad HTML
+               while ( count( $openTags ) > 0 ) {
+                       $ret .= '</' . array_pop( $openTags ) . '>'; // close open tags
+               }
+               return $ret;
+       }
+
+       // truncateHtml() helper function
+       // like strcspn() but adds the skipped chars to $ret
+       private function skipAndAppend( &$ret, $text, $search, $start, $len = -1 ) {
+               $skipCount = 0;
+               if( $start < strlen($text) ) {
+                       $skipCount = strcspn( $text, $search, $start, $len );
+                       $ret .= substr( $text, $start, $skipCount );
+               }
+               return $skipCount;
+       }
+
+       // truncateHtml() helper function
+       // (a) push or pop $tag from $openTags as needed
+       // (b) clear $tag value
+       private function onEndBracket( &$tag, $tagType, $lastCh, &$openTags ) {
+               $tag = ltrim( $tag );
+               if( $tag != '' ) {
+                       if( $tagType == 0 && $lastCh != '/' ) {
+                               $openTags[] = $tag; // tag opened (didn't close itself)
+                       } else if( $tagType == 1 ) {
+                               if( $openTags && $tag == $openTags[count($openTags)-1] ) {
+                                       array_pop( $openTags ); // tag closed
+                               }
+                       }
+                       $tag = '';
+               }
+       }
+
        /**
         * Grammatical transformations, needed for inflected languages
         * Invoked by putting {{grammar:case|word}} in a message