From 7564624d1ca80a4d2f1cc2b4d3d32d5d2e0bca38 Mon Sep 17 00:00:00 2001 From: Fomafix Date: Thu, 22 Mar 2018 11:48:55 +0100 Subject: [PATCH] Strip Unicode 6.3.0 directional formatting characters from title Unicode 6.3.0 (September 2013) added additional directional formatting characters: U+061C ARABIC LETTER MARK U+2066 LEFT-TO-RIGHT ISOLATE U+2067 RIGHT-TO-LEFT ISOLATE U+2068 FIRST STRONG ISOLATE U+2069 POP DIRECTIONAL ISOLATE https://www.fileformat.info/info/unicode/version/6.3/index.htm This change strips the new directional formatting characters from the title like the directional formatting characters from Unicode 1.1.0 (June 1993). Any existing titles containing the new Unicode directional formatting characters get stripped by a run of maintenance/cleanupTitles.php after deployment. This change also allows inserting the new Unicode directional formatting characters into the DISPLAYTITLE. Change-Id: I2279f51048f5252c2e4280ec6a13f060ff9967cb --- RELEASE-NOTES-1.32 | 2 ++ includes/title/MediaWikiTitleCodec.php | 10 +++++++--- resources/src/mediawiki.Title/Title.js | 4 ++-- tests/parser/parserTests.txt | 2 +- .../includes/title/MediaWikiTitleCodecTest.php | 12 +++++++----- .../resources/mediawiki/mediawiki.Title.test.js | 6 +++--- 6 files changed, 22 insertions(+), 14 deletions(-) diff --git a/RELEASE-NOTES-1.32 b/RELEASE-NOTES-1.32 index eca17564cf..2124db9ff2 100644 --- a/RELEASE-NOTES-1.32 +++ b/RELEASE-NOTES-1.32 @@ -96,6 +96,8 @@ because of Phabricator reports. === Other changes in 1.32 === * Soft hyphens (U+00AD) are now automatically removed from titles; these characters can accidentally end up in copy-and-pasted titles. +* Strip Unicode 6.3.0 directional formatting characters (U+061C, U+2066, + U+2067, U+2068, U+2069) from the title. 
* … == Compatibility == diff --git a/includes/title/MediaWikiTitleCodec.php b/includes/title/MediaWikiTitleCodec.php index 655884b99a..7c2d393516 100644 --- a/includes/title/MediaWikiTitleCodec.php +++ b/includes/title/MediaWikiTitleCodec.php @@ -275,11 +275,15 @@ class MediaWikiTitleCodec implements TitleFormatter, TitleParser { 'user_case_dbkey' => $dbkey, ]; - # Strip soft hyphens (U+00AD) and Unicode bidi override characters - # (U+200E, U+200F, U+202A. U+202B, U+202C, U+202D, U+202E). + # Strip soft hyphens (U+00AD) and Unicode directional formatting characters (U+061C, U+200E, + # U+200F, U+202A. U+202B, U+202C, U+202D, U+202E, U+2066, U+2067, U+2068, U+2069). # Sometimes they slip into cut-n-pasted page titles, where the # soft hyphens or override chars get included in list displays. - $dbkey = preg_replace( '/\xC2\xAD|\xE2\x80[\x8E\x8F\xAA-\xAE]/S', '', $dbkey ); + $dbkey = preg_replace( + '/\xC2\xAD|\xD8\x9C|\xE2\x80[\x8E\x8F\xAA-\xAE]|\xE2\x81[\xA6-\xA9]/S', + '', + $dbkey + ); # Clean up whitespace # Note: use of the /u option on preg_replace here will cause diff --git a/resources/src/mediawiki.Title/Title.js b/resources/src/mediawiki.Title/Title.js index b3542cd569..dcaae3e3de 100644 --- a/resources/src/mediawiki.Title/Title.js +++ b/resources/src/mediawiki.Title/Title.js @@ -149,7 +149,7 @@ rWhitespace = /[ _\u00A0\u1680\u180E\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]+/g, // From MediaWikiTitleCodec::splitTitleString() in PHP - rStripCharacters = /[\u00AD\u200E\u200F\u202A-\u202E]/g, + rStripCharacters = /[\u00AD\u061C\u200E\u200F\u202A-\u202E\u2066-\u2069]/g, /** * Slightly modified from Flinfo. Credit goes to Lupo and Flominator. @@ -233,7 +233,7 @@ namespace = defaultNamespace === undefined ? 
NS_MAIN : defaultNamespace; title = title - // Strip soft hyphens and Unicode bidi override characters + // Strip soft hyphens and Unicode directional formatting characters .replace( rStripCharacters, '' ) // Normalise whitespace to underscores and remove duplicates .replace( rWhitespace, '_' ) diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt index aa495b5347..d17fbbe31a 100644 --- a/tests/parser/parserTests.txt +++ b/tests/parser/parserTests.txt @@ -28805,7 +28805,7 @@ foo {{echo|bar [[Category:baz]]}} bar # of the categories in wikitext # Do not remove these characters in edits. # -# As part of the serialization, these bidi characters will get stripped. +# As part of the serialization, these Unicode directional formatting characters will get stripped. !! test RTL (\u200f) and LTR (\u200e) markers around category tags should be stripped !! options diff --git a/tests/phpunit/includes/title/MediaWikiTitleCodecTest.php b/tests/phpunit/includes/title/MediaWikiTitleCodecTest.php index 70aa0710d6..c2725516b6 100644 --- a/tests/phpunit/includes/title/MediaWikiTitleCodecTest.php +++ b/tests/phpunit/includes/title/MediaWikiTitleCodecTest.php @@ -104,11 +104,13 @@ class MediaWikiTitleCodecTest extends MediaWikiTestCase { // names ending in "a" to be female. [ NS_USER, 'Lisa_Müller', '', '', 'de', 'Benutzerin:Lisa Müller' ], [ NS_MAIN, 'FooBar', '', 'remotetestiw', 'en', 'remotetestiw:FooBar' ], - // Strip soft hyphen and Unicode bidi override characters - [ NS_MAIN, "Foo\xC2\xAD\xE2\x80\x8E\xE2\x80\x8F\xE2\x80\xAA\xE2\x80\xAB" . - "\xE2\x80\xAC\xE2\x80\xAD\xE2\x80\xAEbar", '', '', 'en', - "Foo\xC2\xAD\xE2\x80\x8E\xE2\x80\x8F\xE2\x80\xAA\xE2\x80\xAB" . - "\xE2\x80\xAC\xE2\x80\xAD\xE2\x80\xAEbar", 'Foobar' ], + // Strip soft hyphen and Unicode directional formatting characters + [ NS_MAIN, "Foo\xC2\xAD\xD8\x9C\xE2\x80\x8E\xE2\x80\x8F\xE2\x80\xAA\xE2\x80\xAB" . + "\xE2\x80\xAC\xE2\x80\xAD\xE2\x80\xAE\xE2\x81\xA6\xE2\x81\xA7" . 
+ "\xE2\x81\xA8\xE2\x81\xA9bar", '', '', 'en', + "Foo\xC2\xAD\xD8\x9C\xE2\x80\x8E\xE2\x80\x8F\xE2\x80\xAA\xE2\x80\xAB" . + "\xE2\x80\xAC\xE2\x80\xAD\xE2\x80\xAE\xE2\x81\xA6\xE2\x81\xA7" . + "\xE2\x81\xA8\xE2\x81\xA9bar", 'Foobar' ], ]; } diff --git a/tests/qunit/suites/resources/mediawiki/mediawiki.Title.test.js b/tests/qunit/suites/resources/mediawiki/mediawiki.Title.test.js index a775029709..e8db4e1d22 100644 --- a/tests/qunit/suites/resources/mediawiki/mediawiki.Title.test.js +++ b/tests/qunit/suites/resources/mediawiki/mediawiki.Title.test.js @@ -245,8 +245,8 @@ title = new mw.Title( 'Foo \u00A0\u1680\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u2028\u2029\u202F\u205F\u3000 bar' ); assert.equal( title.getMain(), 'Foo_bar', 'Merge multiple types of whitespace/underscores into a single underscore' ); - title = new mw.Title( 'Foo\u00AD\u200E\u200F\u202A\u202B\u202C\u202D\u202Ebar' ); - assert.equal( title.getMain(), 'Foobar', 'Strip soft hyphen and Unicode bidi override characters' ); + title = new mw.Title( 'Foo\u00AD\u061C\u200E\u200F\u202A\u202B\u202C\u202D\u202E\u2066\u2067\u2068\u2069bar' ); + assert.equal( title.getMain(), 'Foobar', 'Strip soft hyphen and Unicode directional formatting characters' ); // Regression test: Previously it would only detect an extension if there is no space after it title = new mw.Title( 'Example.js ' ); @@ -668,7 +668,7 @@ }, { fileName: 'BI\u200EDI.jpg', - typeOfName: 'Name containing BIDI overrides', + typeOfName: 'Name containing Unicode directional formatting characters', nameText: 'BIDI', prefixedText: 'File:BIDI.jpg' }, -- 2.20.1