resources/src/mediawiki.String.js

   1 ( function () {
   2
   3         /**
   4          * @class mw.String
   5          * @singleton
   6          */
   7
   8         /**
   9          * Calculate the byte length of a string (accounting for UTF-8).
  10          *
  11          * @author Jan Paul Posma, 2011
  12          * @author Timo Tijhof, 2012
  13          * @author David Chan, 2013
  14          *
  15          * @param {string} str
  16          * @return {number}
  17          */
  18         function byteLength( str ) {
  19                 // This basically figures out how many bytes a UTF-16 string (which is what js sees)
  20                 // will take in UTF-8 by replacing a 2 byte character with 2 *'s, etc, and counting that.
  21                 // Note, surrogate (\uD800-\uDFFF) characters are counted as 2 bytes, since there's two of them
  22                 // and the actual character takes 4 bytes in UTF-8 (2*2=4). Might not work perfectly in
  23                 // edge cases such as illegal sequences, but that should never happen.
  24
  25                 // https://en.wikipedia.org/wiki/UTF-8#Description
  26                 // The mapping from UTF-16 code units to UTF-8 bytes is as follows:
  27                 // > Range 0000-007F: codepoints that become 1 byte of UTF-8
  28                 // > Range 0080-07FF: codepoints that become 2 bytes of UTF-8
  29                 // > Range 0800-D7FF: codepoints that become 3 bytes of UTF-8
  30                 // > Range D800-DFFF: Surrogates (each pair becomes 4 bytes of UTF-8)
  31                 // > Range E000-FFFF: codepoints that become 3 bytes of UTF-8 (continued)
  32
  33                 return str
  34                         .replace( /[\u0080-\u07FF\uD800-\uDFFF]/g, '**' )
  35                         .replace( /[\u0800-\uD7FF\uE000-\uFFFF]/g, '***' )
  36                         .length;
  37         }
  38
  39         /**
  40          * Calculate the character length of a string (accounting for UTF-16 surrogates).
  41          *
  42          * @param {string} str
  43          * @return {number}
  44          */
  45         function codePointLength( str ) {
  46                 return str
  47                         // Low surrogate + high surrogate pairs represent one character (codepoint) each
  48                         .replace( /[\uD800-\uDBFF][\uDC00-\uDFFF]/g, '*' )
  49                         .length;
  50         }
  51
  52         // Like String#charAt, but return the pair of UTF-16 surrogates for characters outside of BMP.
  53         function codePointAt( string, offset, backwards ) {
  54                 // We don't need to check for offsets at the beginning or end of string,
  55                 // String#slice will simply return a shorter (or empty) substring.
  56                 var maybePair = backwards ?
  57                         string.slice( offset - 1, offset + 1 ) :
  58                         string.slice( offset, offset + 2 );
  59                 if ( /^[\uD800-\uDBFF][\uDC00-\uDFFF]$/.test( maybePair ) ) {
  60                         return maybePair;
  61                 } else {
  62                         return string.charAt( offset );
  63                 }
  64         }
  65
  66         function trimLength( safeVal, newVal, length, lengthFn ) {
  67                 var startMatches, endMatches, matchesLen, inpParts, chopOff, oldChar, newChar,
  68                         oldVal = safeVal;
  69
  70                 // Run the hook if one was provided, but only on the length
  71                 // assessment. The value itself is not to be affected by the hook.
  72                 if ( lengthFn( newVal ) <= length ) {
  73                         // Limit was not reached, just remember the new value
  74                         // and let the user continue.
  75                         return {
  76                                 newVal: newVal,
  77                                 trimmed: false
  78                         };
  79                 }
  80
  81                 // Current input is longer than the active limit.
  82                 // Figure out what was added and limit the addition.
  83                 startMatches = 0;
  84                 endMatches = 0;
  85
  86                 // It is important that we keep the search within the range of
  87                 // the shortest string's length.
  88                 // Imagine a user adds text that matches the end of the old value
  89                 // (e.g. "foo" -> "foofoo"). startMatches would be 3, but without
  90                 // limiting both searches to the shortest length, endMatches would
  91                 // also be 3.
  92                 matchesLen = Math.min( newVal.length, oldVal.length );
  93
  94                 // Count same characters from the left, first.
  95                 // (if "foo" -> "foofoo", assume addition was at the end).
  96                 while ( startMatches < matchesLen ) {
  97                         oldChar = codePointAt( oldVal, startMatches, false );
  98                         newChar = codePointAt( newVal, startMatches, false );
  99                         if ( oldChar !== newChar ) {
 100                                 break;
 101                         }
 102                         startMatches += oldChar.length;
 103                 }
 104
 105                 while ( endMatches < ( matchesLen - startMatches ) ) {
 106                         oldChar = codePointAt( oldVal, oldVal.length - 1 - endMatches, true );
 107                         newChar = codePointAt( newVal, newVal.length - 1 - endMatches, true );
 108                         if ( oldChar !== newChar ) {
 109                                 break;
 110                         }
 111                         endMatches += oldChar.length;
 112                 }
 113
 114                 inpParts = [
 115                         // Same start
 116                         newVal.slice( 0, startMatches ),
 117                         // Inserted content
 118                         newVal.slice( startMatches, newVal.length - endMatches ),
 119                         // Same end
 120                         newVal.slice( newVal.length - endMatches )
 121                 ];
 122
 123                 // Chop off characters from the end of the "inserted content" string
 124                 // until the limit is statisfied.
 125                 // Make sure to stop when there is nothing to slice (T43450).
 126                 while ( lengthFn( inpParts.join( '' ) ) > length && inpParts[ 1 ].length > 0 ) {
 127                         // Do not chop off halves of surrogate pairs
 128                         chopOff = /[\uD800-\uDBFF][\uDC00-\uDFFF]$/.test( inpParts[ 1 ] ) ? 2 : 1;
 129                         inpParts[ 1 ] = inpParts[ 1 ].slice( 0, -chopOff );
 130                 }
 131
 132                 return {
 133                         newVal: inpParts.join( '' ),
 134                         // For pathological lengthFn() that always returns a length greater than the limit, we might have
 135                         // ended up not trimming - check for this case to avoid infinite loops
 136                         trimmed: newVal !== inpParts.join( '' )
 137                 };
 138         }
 139
 140         /**
 141          * Utility function to trim down a string, based on byteLimit
 142          * and given a safe start position. It supports insertion anywhere
 143          * in the string, so "foo" to "fobaro" if limit is 4 will result in
 144          * "fobo", not "foba". Basically emulating the native maxlength by
 145          * reconstructing where the insertion occurred.
 146          *
 147          * @param {string} safeVal Known value that was previously returned by this
 148          * function, if none, pass empty string.
 149          * @param {string} newVal New value that may have to be trimmed down.
 150          * @param {number} byteLimit Number of bytes the value may be in size.
 151          * @param {Function} [filterFn] Function to call on the string before assessing the length.
 152          * @return {Object}
 153          * @return {string} return.newVal
 154          * @return {boolean} return.trimmed
 155          */
 156         function trimByteLength( safeVal, newVal, byteLimit, filterFn ) {
 157                 var lengthFn;
 158                 if ( filterFn ) {
 159                         lengthFn = function ( val ) {
 160                                 return byteLength( filterFn( val ) );
 161                         };
 162                 } else {
 163                         lengthFn = byteLength;
 164                 }
 165
 166                 return trimLength( safeVal, newVal, byteLimit, lengthFn );
 167         }
 168
 169         /**
 170          * Utility function to trim down a string, based on codePointLimit
 171          * and given a safe start position. It supports insertion anywhere
 172          * in the string, so "foo" to "fobaro" if limit is 4 will result in
 173          * "fobo", not "foba". Basically emulating the native maxlength by
 174          * reconstructing where the insertion occurred.
 175          *
 176          * @param {string} safeVal Known value that was previously returned by this
 177          * function, if none, pass empty string.
 178          * @param {string} newVal New value that may have to be trimmed down.
 179          * @param {number} codePointLimit Number of characters the value may be in size.
 180          * @param {Function} [filterFn] Function to call on the string before assessing the length.
 181          * @return {Object}
 182          * @return {string} return.newVal
 183          * @return {boolean} return.trimmed
 184          */
 185         function trimCodePointLength( safeVal, newVal, codePointLimit, filterFn ) {
 186                 var lengthFn;
 187                 if ( filterFn ) {
 188                         lengthFn = function ( val ) {
 189                                 return codePointLength( filterFn( val ) );
 190                         };
 191                 } else {
 192                         lengthFn = codePointLength;
 193                 }
 194
 195                 return trimLength( safeVal, newVal, codePointLimit, lengthFn );
 196         }
 197
        // Public interface of this module. codePointAt() and trimLength() are not
        // listed here and so remain internal helpers.
        module.exports = {
                byteLength: byteLength,
                codePointLength: codePointLength,
                trimByteLength: trimByteLength,
                trimCodePointLength: trimCodePointLength
        };
 204
 205 }() );