resources/src/mediawiki/mediawiki.String.js

   1 ( function () {
   2
   3         /**
   4          * @class mw.String
   5          * @singleton
   6          */
   7
   8         /**
   9          * Calculate the byte length of a string (accounting for UTF-8).
  10          *
  11          * @author Jan Paul Posma, 2011
  12          * @author Timo Tijhof, 2012
  13          * @author David Chan, 2013
  14          *
  15          * @param {string} str
  16          * @return {number}
  17          */
  18         function byteLength( str ) {
  19                 // This basically figures out how many bytes a UTF-16 string (which is what js sees)
  20                 // will take in UTF-8 by replacing a 2 byte character with 2 *'s, etc, and counting that.
  21                 // Note, surrogate (\uD800-\uDFFF) characters are counted as 2 bytes, since there's two of them
  22                 // and the actual character takes 4 bytes in UTF-8 (2*2=4). Might not work perfectly in
  23                 // edge cases such as illegal sequences, but that should never happen.
  24
  25                 // https://en.wikipedia.org/wiki/UTF-8#Description
  26                 // The mapping from UTF-16 code units to UTF-8 bytes is as follows:
  27                 // > Range 0000-007F: codepoints that become 1 byte of UTF-8
  28                 // > Range 0080-07FF: codepoints that become 2 bytes of UTF-8
  29                 // > Range 0800-D7FF: codepoints that become 3 bytes of UTF-8
  30                 // > Range D800-DFFF: Surrogates (each pair becomes 4 bytes of UTF-8)
  31                 // > Range E000-FFFF: codepoints that become 3 bytes of UTF-8 (continued)
  32
  33                 return str
  34                         .replace( /[\u0080-\u07FF\uD800-\uDFFF]/g, '**' )
  35                         .replace( /[\u0800-\uD7FF\uE000-\uFFFF]/g, '***' )
  36                         .length;
  37         }
  38
  39         // Like String#charAt, but return the pair of UTF-16 surrogates for characters outside of BMP.
  40         function codePointAt( string, offset, backwards ) {
  41                 // We don't need to check for offsets at the beginning or end of string,
  42                 // String#slice will simply return a shorter (or empty) substring.
  43                 var maybePair = backwards ?
  44                         string.slice( offset - 1, offset + 1 ) :
  45                         string.slice( offset, offset + 2 );
  46                 if ( /^[\uD800-\uDBFF][\uDC00-\uDFFF]$/.test( maybePair ) ) {
  47                         return maybePair;
  48                 } else {
  49                         return string.charAt( offset );
  50                 }
  51         }
  52
  53         /**
  54          * Utility function to trim down a string, based on byteLimit
  55          * and given a safe start position. It supports insertion anywhere
  56          * in the string, so "foo" to "fobaro" if limit is 4 will result in
  57          * "fobo", not "foba". Basically emulating the native maxlength by
  58          * reconstructing where the insertion occurred.
  59          *
  60          * @param {string} safeVal Known value that was previously returned by this
  61          * function, if none, pass empty string.
  62          * @param {string} newVal New value that may have to be trimmed down.
  63          * @param {number} byteLimit Number of bytes the value may be in size.
  64          * @param {Function} [fn] Function to call on the string before assessing the length.
  65          * @return {Object}
  66          * @return {string} return.newVal
  67          * @return {boolean} return.trimmed
  68          */
  69         function trimByteLength( safeVal, newVal, byteLimit, fn ) {
  70                 var startMatches, endMatches, matchesLen, inpParts, chopOff, oldChar, newChar,
  71                         oldVal = safeVal;
  72
  73                 // Run the hook if one was provided, but only on the length
  74                 // assessment. The value itself is not to be affected by the hook.
  75                 if ( byteLength( fn ? fn( newVal ) : newVal ) <= byteLimit ) {
  76                         // Limit was not reached, just remember the new value
  77                         // and let the user continue.
  78                         return {
  79                                 newVal: newVal,
  80                                 trimmed: false
  81                         };
  82                 }
  83
  84                 // Current input is longer than the active limit.
  85                 // Figure out what was added and limit the addition.
  86                 startMatches = 0;
  87                 endMatches = 0;
  88
  89                 // It is important that we keep the search within the range of
  90                 // the shortest string's length.
  91                 // Imagine a user adds text that matches the end of the old value
  92                 // (e.g. "foo" -> "foofoo"). startMatches would be 3, but without
  93                 // limiting both searches to the shortest length, endMatches would
  94                 // also be 3.
  95                 matchesLen = Math.min( newVal.length, oldVal.length );
  96
  97                 // Count same characters from the left, first.
  98                 // (if "foo" -> "foofoo", assume addition was at the end).
  99                 while ( startMatches < matchesLen ) {
 100                         oldChar = codePointAt( oldVal, startMatches, false );
 101                         newChar = codePointAt( newVal, startMatches, false );
 102                         if ( oldChar !== newChar ) {
 103                                 break;
 104                         }
 105                         startMatches += oldChar.length;
 106                 }
 107
 108                 while ( endMatches < ( matchesLen - startMatches ) ) {
 109                         oldChar = codePointAt( oldVal, oldVal.length - 1 - endMatches, true );
 110                         newChar = codePointAt( newVal, newVal.length - 1 - endMatches, true );
 111                         if ( oldChar !== newChar ) {
 112                                 break;
 113                         }
 114                         endMatches += oldChar.length;
 115                 }
 116
 117                 inpParts = [
 118                         // Same start
 119                         newVal.slice( 0, startMatches ),
 120                         // Inserted content
 121                         newVal.slice( startMatches, newVal.length - endMatches ),
 122                         // Same end
 123                         newVal.slice( newVal.length - endMatches )
 124                 ];
 125
 126                 // Chop off characters from the end of the "inserted content" string
 127                 // until the limit is statisfied.
 128                 if ( fn ) {
 129                         // stop, when there is nothing to slice - T43450
 130                         while ( byteLength( fn( inpParts.join( '' ) ) ) > byteLimit && inpParts[ 1 ].length > 0 ) {
 131                                 // Do not chop off halves of surrogate pairs
 132                                 chopOff = /[\uD800-\uDBFF][\uDC00-\uDFFF]$/.test( inpParts[ 1 ] ) ? 2 : 1;
 133                                 inpParts[ 1 ] = inpParts[ 1 ].slice( 0, -chopOff );
 134                         }
 135                 } else {
 136                         while ( byteLength( inpParts.join( '' ) ) > byteLimit ) {
 137                                 // Do not chop off halves of surrogate pairs
 138                                 chopOff = /[\uD800-\uDBFF][\uDC00-\uDFFF]$/.test( inpParts[ 1 ] ) ? 2 : 1;
 139                                 inpParts[ 1 ] = inpParts[ 1 ].slice( 0, -chopOff );
 140                         }
 141                 }
 142
 143                 return {
 144                         newVal: inpParts.join( '' ),
 145                         // For pathological fn() that always returns a value longer than the limit, we might have
 146                         // ended up not trimming - check for this case to avoid infinite loops
 147                         trimmed: newVal !== inpParts.join( '' )
 148                 };
 149         }
 150
 151         module.exports = {
 152                 byteLength: byteLength,
 153                 trimByteLength: trimByteLength
 154         };
 155
 156 }() );