From 1954cff1d9ee8fb2645043a60bdddf507f59dc5a Mon Sep 17 00:00:00 2001 From: =?utf8?q?Bartosz=20Dziewo=C5=84ski?= Date: Mon, 5 Feb 2018 19:38:10 +0100 Subject: [PATCH] jquery.byteLimit: Handle characters outside BMP (surrogate pairs) when trimming Bug: T186364 Change-Id: I6282d97bcd637ae8e86d70996adb468582c8f02f --- resources/src/jquery/jquery.byteLimit.js | 48 ++++++++++++++----- .../resources/jquery/jquery.byteLimit.test.js | 42 +++++++++++++++- 2 files changed, 76 insertions(+), 14 deletions(-) diff --git a/resources/src/jquery/jquery.byteLimit.js b/resources/src/jquery/jquery.byteLimit.js index c75246c0f6..3ce6e7fca5 100644 --- a/resources/src/jquery/jquery.byteLimit.js +++ b/resources/src/jquery/jquery.byteLimit.js @@ -14,6 +14,20 @@ 'blur.byteLimit' ].join( ' ' ); + // Like String#charAt, but return the pair of UTF-16 surrogates for characters outside of BMP. + function codePointAt( string, offset, backwards ) { + // We don't need to check for offsets at the beginning or end of string, + // String#slice will simply return a shorter (or empty) substring. + var maybePair = backwards ? + string.slice( offset - 1, offset + 1 ) : + string.slice( offset, offset + 2 ); + if ( /^[\uD800-\uDBFF][\uDC00-\uDFFF]$/.test( maybePair ) ) { + return maybePair; + } else { + return string.charAt( offset ); + } + } + /** * Utility function to trim down a string, based on byteLimit * and given a safe start position. It supports insertion anywhere @@ -32,7 +46,7 @@ * @return {boolean} return.trimmed */ $.trimByteLength = function ( safeVal, newVal, byteLimit, fn ) { - var startMatches, endMatches, matchesLen, inpParts, + var startMatches, endMatches, matchesLen, inpParts, chopOff, oldChar, newChar, oldVal = safeVal; // Run the hook if one was provided, but only on the length @@ -61,18 +75,22 @@ // Count same characters from the left, first. // (if "foo" -> "foofoo", assume addition was at the end). - while ( - startMatches < matchesLen && - oldVal.charAt( startMatches ) === newVal.charAt( startMatches ) - ) { - startMatches += 1; + while ( startMatches < matchesLen ) { + oldChar = codePointAt( oldVal, startMatches, false ); + newChar = codePointAt( newVal, startMatches, false ); + if ( oldChar !== newChar ) { + break; + } + startMatches += oldChar.length; } - while ( - endMatches < ( matchesLen - startMatches ) && - oldVal.charAt( oldVal.length - 1 - endMatches ) === newVal.charAt( newVal.length - 1 - endMatches ) - ) { - endMatches += 1; + while ( endMatches < ( matchesLen - startMatches ) ) { + oldChar = codePointAt( oldVal, oldVal.length - 1 - endMatches, true ); + newChar = codePointAt( newVal, newVal.length - 1 - endMatches, true ); + if ( oldChar !== newChar ) { + break; + } + endMatches += oldChar.length; } inpParts = [ @@ -89,11 +107,15 @@ if ( fn ) { // stop, when there is nothing to slice - T43450 while ( $.byteLength( fn( inpParts.join( '' ) ) ) > byteLimit && inpParts[ 1 ].length > 0 ) { - inpParts[ 1 ] = inpParts[ 1 ].slice( 0, -1 ); + // Do not chop off halves of surrogate pairs + chopOff = /[\uD800-\uDBFF][\uDC00-\uDFFF]$/.test( inpParts[ 1 ] ) ? 2 : 1; + inpParts[ 1 ] = inpParts[ 1 ].slice( 0, -chopOff ); } } else { while ( $.byteLength( inpParts.join( '' ) ) > byteLimit ) { - inpParts[ 1 ] = inpParts[ 1 ].slice( 0, -1 ); + // Do not chop off halves of surrogate pairs + chopOff = /[\uD800-\uDBFF][\uDC00-\uDFFF]$/.test( inpParts[ 1 ] ) ? 2 : 1; + inpParts[ 1 ] = inpParts[ 1 ].slice( 0, -chopOff ); } } diff --git a/tests/qunit/suites/resources/jquery/jquery.byteLimit.test.js b/tests/qunit/suites/resources/jquery/jquery.byteLimit.test.js index 8555a7e4d8..1a660cfde1 100644 --- a/tests/qunit/suites/resources/jquery/jquery.byteLimit.test.js +++ b/tests/qunit/suites/resources/jquery/jquery.byteLimit.test.js @@ -1,5 +1,5 @@ ( function ( $, mw ) { - var simpleSample, U_20AC, mbSample; + var simpleSample, U_20AC, poop, mbSample; QUnit.module( 'jquery.byteLimit', QUnit.newMwEnvironment() ); @@ -9,6 +9,9 @@ // 3 bytes (euro-symbol) U_20AC = '\u20AC'; + // Outside of the BMP (pile of poo emoji) + poop = '\uD83D\uDCA9'; // "💩" + // Multi-byte sample (22 chars, 26 bytes) mbSample = '1234567890' + U_20AC + '1234567890' + U_20AC; @@ -109,6 +112,14 @@ expected: '1234567890' + U_20AC + '1' } ); + byteLimitTest( { + description: 'Limit using a custom value (multibyte, outside BMP)', + $input: $( '' ).attr( 'type', 'text' ) + .byteLimit( 3 ), + sample: poop, + expected: '' + } ); + byteLimitTest( { description: 'Limit using a custom value (multibyte) overlapping a byte', $input: $( '' ).attr( 'type', 'text' ) @@ -245,4 +256,33 @@ assert.strictEqual( $el.val(), 'abc', 'Trim from the insertion point (at 1), not the end' ); } ); + + QUnit.test( 'Do not cut up false matching substrings in emoji insertions', function ( assert ) { + var $el, + oldVal = '\uD83D\uDCA9\uD83D\uDCA9', // "💩💩" + newVal = '\uD83D\uDCA9\uD83D\uDCB9\uD83E\uDCA9\uD83D\uDCA9', // "💩💹🢩💩" + expected = '\uD83D\uDCA9\uD83D\uDCB9\uD83D\uDCA9'; // "💩💹💩" + + // Possible bad results: + // * With no surrogate support: + // '\uD83D\uDCA9\uD83D\uDCB9\uD83E\uDCA9' "💩💹🢩" + // * With correct trimming but bad detection of inserted text: + // '\uD83D\uDCA9\uD83D\uDCB9\uDCA9' "💩💹�" + + $el = $( '' ).attr( 'type', 'text' ) + .appendTo( '#qunit-fixture' ) + .byteLimit( 12 ) + .val( oldVal ).trigger( 'change' ) + .val( newVal ).trigger( 'change' ); + + assert.strictEqual( $el.val(), expected, 'Pasted emoji correctly trimmed at the end' ); + } ); + + byteLimitTest( { + description: 'Unpaired surrogates do not crash', + $input: $( '' ).attr( 'type', 'text' ).byteLimit( 4 ), + sample: '\uD800\uD800\uDFFF', + expected: '\uD800' + } ); + }( jQuery, mediaWiki ) ); -- 2.20.1