From 71ec883cb379c0cda193715b7eccddca6f31a1d8 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Bartosz=20Dziewo=C5=84ski?= Date: Wed, 24 Aug 2016 21:33:45 +0200 Subject: [PATCH] mw.Title: Correct handling of Unicode whitespace and bidi control characters MediaWiki titles may not contain Unicode bidi control characters, e.g. U+200E LEFT-TO-RIGHT MARK. They are silently stripped and such a title is considered valid. MediaWiki titles may not contain any whitespace other than a regular space. Most of them are silently replaced with a regular space and such a title is considered valid, but there are some (e.g. the tab character) which make the title invalid. I'm not sure if this is an intentional behavior, but I added a test case to verify it. Bug: T143759 Change-Id: If8fad1f896027c5d93a62b0785923a39136c6a36 --- resources/src/mediawiki/mediawiki.Title.js | 29 +++++++------------ tests/phpunit/includes/TitleTest.php | 2 ++ .../mediawiki/mediawiki.Title.test.js | 10 +++++-- 3 files changed, 20 insertions(+), 21 deletions(-) diff --git a/resources/src/mediawiki/mediawiki.Title.js b/resources/src/mediawiki/mediawiki.Title.js index 4c57faa6c8..e4687685eb 100644 --- a/resources/src/mediawiki/mediawiki.Title.js +++ b/resources/src/mediawiki/mediawiki.Title.js @@ -164,9 +164,12 @@ '|&#x[0-9A-Fa-f]+;' ), - // From MediaWikiTitleCodec.php#L225 @26fcab1f18c568a41 - // "Clean up whitespace" in function MediaWikiTitleCodec::splitTitleString() - rWhitespace = /[ _\u0009\u00A0\u1680\u180E\u2000-\u200A\u2028\u2029\u202F\u205F\u3000\s]+/g, + // From MediaWikiTitleCodec::splitTitleString() in PHP + // Note that this is not equivalent to /\s/, e.g. underscore is included, tab is not included. + rWhitespace = /[ _\u00A0\u1680\u180E\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]+/g, + + // From MediaWikiTitleCodec::splitTitleString() in PHP + rUnicodeBidi = /[\u200E\u200F\u202A-\u202E]/g, /** * Slightly modified from Flinfo. Credit goes to Lupo and Flominator. @@ -181,18 +184,6 @@ replace: '', generalRule: true }, - // Space, underscore, tab, NBSP and other unusual spaces - { - pattern: rWhitespace, - replace: ' ', - generalRule: true - }, - // unicode bidi override characters: Implicit, Embeds, Overrides - { - pattern: /[\u200E\u200F\u202A-\u202E]/g, - replace: '', - generalRule: true - }, // control characters { pattern: /[\x00-\x1f\x7f]/g, @@ -261,8 +252,10 @@ namespace = defaultNamespace === undefined ? NS_MAIN : defaultNamespace; title = title + // Strip Unicode bidi override characters + .replace( rUnicodeBidi, '' ) // Normalise whitespace to underscores and remove duplicates - .replace( /[ _\s]+/g, '_' ) + .replace( rWhitespace, '_' ) // Trim underscores .replace( rUnderscoreTrim, '' ); @@ -557,8 +550,8 @@ namespace = defaultNamespace === undefined ? NS_MAIN : defaultNamespace; - // Normalise whitespace and remove duplicates - title = $.trim( title.replace( rWhitespace, ' ' ) ); + // Normalise additional whitespace + title = $.trim( title.replace( /\s/g, ' ' ) ); // Process initial colon if ( title !== '' && title[ 0 ] === ':' ) { diff --git a/tests/phpunit/includes/TitleTest.php b/tests/phpunit/includes/TitleTest.php index 7850f2490c..7925c6f8f9 100644 --- a/tests/phpunit/includes/TitleTest.php +++ b/tests/phpunit/includes/TitleTest.php @@ -90,6 +90,8 @@ class TitleTest extends MediaWikiTestCase { [ 'A < B', 'title-invalid-characters' ], [ 'A > B', 'title-invalid-characters' ], [ 'A | B', 'title-invalid-characters' ], + [ "A \t B", 'title-invalid-characters' ], + [ "A \n B", 'title-invalid-characters' ], // URL encoding [ 'A%20B', 'title-invalid-characters' ], [ 'A%23B', 'title-invalid-characters' ], diff --git a/tests/qunit/suites/resources/mediawiki/mediawiki.Title.test.js b/tests/qunit/suites/resources/mediawiki/mediawiki.Title.test.js index 991725b8c6..886e2b6f04 100644 --- a/tests/qunit/suites/resources/mediawiki/mediawiki.Title.test.js +++ b/tests/qunit/suites/resources/mediawiki/mediawiki.Title.test.js @@ -38,6 +38,8 @@ 'A < B', 'A > B', 'A | B', + 'A \t B', + 'A \n B', // URL encoding 'A%20B', 'A%23B', @@ -222,7 +224,7 @@ assert.equal( title.getPrefixedText(), '.foo' ); } ); - QUnit.test( 'Transformation', 11, function ( assert ) { + QUnit.test( 'Transformation', 12, function ( assert ) { var title; title = new mw.Title( 'File:quux pif.jpg' ); @@ -242,10 +244,12 @@ assert.equal( title.toText(), 'User:HAshAr' ); assert.equal( title.getNamespaceId(), 2, 'Case-insensitive namespace prefix' ); - // Don't ask why, it's the way the backend works. One space is kept of each set. - title = new mw.Title( 'Foo __ \t __ bar' ); + title = new mw.Title( 'Foo \u00A0\u1680\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u2028\u2029\u202F\u205F\u3000 bar' ); assert.equal( title.getMain(), 'Foo_bar', 'Merge multiple types of whitespace/underscores into a single underscore' ); + title = new mw.Title( 'Foo\u200E\u200F\u202A\u202B\u202C\u202D\u202Ebar' ); + assert.equal( title.getMain(), 'Foobar', 'Strip Unicode bidi override characters' ); + // Regression test: Previously it would only detect an extension if there is no space after it title = new mw.Title( 'Example.js ' ); assert.equal( title.getExtension(), 'js', 'Space after an extension is stripped' ); -- 2.20.1