From 6b8a5a137d3f449b4056e3de82fa6747b45f1f9a Mon Sep 17 00:00:00 2001 From: Fomafix Date: Sat, 25 Nov 2017 21:02:55 +0100 Subject: [PATCH] Strip soft hyphens (U+00AD) from title This change strips all soft hyphens from the title. This is already done for Unicode bidi characters (T5696). URLs with soft hyphens (%C2%AD) get redirected (301) to the URL without soft hyphens (T145605): https://de.wikipedia.org/wiki/Bosnatal%C2%ADbahn gets redirected to https://de.wikipedia.org/wiki/Bosnatalbahn A link in wikitext containing a soft hyphen, "[[Bosnatal<shy>bahn]]" (the "<shy>" stands here for a soft hyphen), links to "Bosnatalbahn" but displays "Bosnatal<shy>bahn". This change also allows soft hyphens to be inserted into the displaytitle (T66528). This allows soft hyphens to be inserted into the first heading for manual hyphenation of titles with very long words. This change prevents access to any existing articles containing soft hyphens in the title. After deploying this change, a run of maintenance/cleanupTitles.php must be performed to rename existing titles with soft hyphens. Before deploying this change, existing articles and redirects with soft hyphens in the title can already be renamed or deleted. Bug: T121979 Bug: T66528 Change-Id: Ie13626c433cdb460dbf00b3bba28d1bb5a7b6d6a --- RELEASE-NOTES-1.32 | 2 ++ includes/title/MediaWikiTitleCodec.php | 7 +++--- resources/src/mediawiki.Title/Title.js | 6 ++--- tests/parser/parserTests.txt | 24 +++++++++++++++++++ .../title/MediaWikiTitleCodecTest.php | 5 ++++ .../mediawiki/mediawiki.Title.test.js | 4 ++-- 6 files changed, 40 insertions(+), 8 deletions(-) diff --git a/RELEASE-NOTES-1.32 b/RELEASE-NOTES-1.32 index 9fd3161f1e..eca17564cf 100644 --- a/RELEASE-NOTES-1.32 +++ b/RELEASE-NOTES-1.32 @@ -94,6 +94,8 @@ because of Phabricator reports. deprecated. Use addModules() instead. === Other changes in 1.32 === +* Soft hyphens (U+00AD) are now automatically removed from titles; these + characters can accidentally end up in copy-and-pasted titles. 
* … == Compatibility == diff --git a/includes/title/MediaWikiTitleCodec.php b/includes/title/MediaWikiTitleCodec.php index 890a870a2a..655884b99a 100644 --- a/includes/title/MediaWikiTitleCodec.php +++ b/includes/title/MediaWikiTitleCodec.php @@ -275,10 +275,11 @@ class MediaWikiTitleCodec implements TitleFormatter, TitleParser { 'user_case_dbkey' => $dbkey, ]; - # Strip Unicode bidi override characters. + # Strip soft hyphens (U+00AD) and Unicode bidi override characters + # (U+200E, U+200F, U+202A, U+202B, U+202C, U+202D, U+202E). # Sometimes they slip into cut-n-pasted page titles, where the - # override chars get included in list displays. - $dbkey = preg_replace( '/\xE2\x80[\x8E\x8F\xAA-\xAE]/S', '', $dbkey ); + # soft hyphens or override chars get included in list displays. + $dbkey = preg_replace( '/\xC2\xAD|\xE2\x80[\x8E\x8F\xAA-\xAE]/S', '', $dbkey ); # Clean up whitespace # Note: use of the /u option on preg_replace here will cause diff --git a/resources/src/mediawiki.Title/Title.js b/resources/src/mediawiki.Title/Title.js index 2b76187359..b3542cd569 100644 --- a/resources/src/mediawiki.Title/Title.js +++ b/resources/src/mediawiki.Title/Title.js @@ -149,7 +149,7 @@ rWhitespace = /[ _\u00A0\u1680\u180E\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]+/g, // From MediaWikiTitleCodec::splitTitleString() in PHP - rUnicodeBidi = /[\u200E\u200F\u202A-\u202E]/g, + rStripCharacters = /[\u00AD\u200E\u200F\u202A-\u202E]/g, /** * Slightly modified from Flinfo. Credit goes to Lupo and Flominator. @@ -233,8 +233,8 @@ namespace = defaultNamespace === undefined ? 
NS_MAIN : defaultNamespace; title = title - // Strip Unicode bidi override characters - .replace( rUnicodeBidi, '' ) + // Strip soft hyphens and Unicode bidi override characters + .replace( rStripCharacters, '' ) // Normalise whitespace to underscores and remove duplicates .replace( rWhitespace, '_' ) // Trim underscores diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt index 05afefacff..aa495b5347 100644 --- a/tests/parser/parserTests.txt +++ b/tests/parser/parserTests.txt @@ -30811,3 +30811,27 @@ header *foo footer !! end + +!! test +Check soft hyphens as entities (­) in displaytitle (T66528) +!! options +showtitle +title=[[Lopadotemachoselachogaleokranioleipsanodrimhypotrimmatosilphioparaomelitokatakechymenokichlepikossyphophattoperisteralektryonoptekephalliokigklopeleiolagoiosiraiobaphetraganopterygon]] +!! wikitext +{{DISPLAYTITLE:Lopado­temacho­selacho­galeo­kranio­leipsano­drim­hypo­trimmato­silphio­parao­melito­katakechy­meno­kichl­epi­kossypho­phatto­perister­alektryon­opte­kephallio­kigklo­peleio­lagoio­siraio­baphe­tragano­pterygon}} +!! html/php +Lopado­temacho­selacho­galeo­kranio­leipsano­drim­hypo­trimmato­silphio­parao­melito­katakechy­meno­kichl­epi­kossypho­phatto­perister­alektryon­opte­kephallio­kigklo­peleio­lagoio­siraio­baphe­tragano­pterygon + +!! end + +!! test +Check soft hyphens as Unicode characters (U+00AD) in displaytitle (T66528) +!! options +showtitle +title=[[Lopadotemachoselachogaleokranioleipsanodrimhypotrimmatosilphioparaomelitokatakechymenokichlepikossyphophattoperisteralektryonoptekephalliokigklopeleiolagoiosiraiobaphetraganopterygon]] +!! wikitext +{{DISPLAYTITLE:Lopado­temacho­selacho­galeo­kranio­leipsano­drim­hypo­trimmato­silphio­parao­melito­katakechy­meno­kichl­epi­kossypho­phatto­perister­alektryon­opte­kephallio­kigklo­peleio­lagoio­siraio­baphe­tragano­pterygon}} +!! 
html/php +Lopado­temacho­selacho­galeo­kranio­leipsano­drim­hypo­trimmato­silphio­parao­melito­katakechy­meno­kichl­epi­kossypho­phatto­perister­alektryon­opte­kephallio­kigklo­peleio­lagoio­siraio­baphe­tragano­pterygon + +!! end diff --git a/tests/phpunit/includes/title/MediaWikiTitleCodecTest.php b/tests/phpunit/includes/title/MediaWikiTitleCodecTest.php index e1b98ec341..70aa0710d6 100644 --- a/tests/phpunit/includes/title/MediaWikiTitleCodecTest.php +++ b/tests/phpunit/includes/title/MediaWikiTitleCodecTest.php @@ -104,6 +104,11 @@ class MediaWikiTitleCodecTest extends MediaWikiTestCase { // names ending in "a" to be female. [ NS_USER, 'Lisa_Müller', '', '', 'de', 'Benutzerin:Lisa Müller' ], [ NS_MAIN, 'FooBar', '', 'remotetestiw', 'en', 'remotetestiw:FooBar' ], + // Strip soft hyphen and Unicode bidi override characters + [ NS_MAIN, "Foo\xC2\xAD\xE2\x80\x8E\xE2\x80\x8F\xE2\x80\xAA\xE2\x80\xAB" . + "\xE2\x80\xAC\xE2\x80\xAD\xE2\x80\xAEbar", '', '', 'en', + "Foo\xC2\xAD\xE2\x80\x8E\xE2\x80\x8F\xE2\x80\xAA\xE2\x80\xAB" . 
+ "\xE2\x80\xAC\xE2\x80\xAD\xE2\x80\xAEbar", 'Foobar' ], ]; } diff --git a/tests/qunit/suites/resources/mediawiki/mediawiki.Title.test.js b/tests/qunit/suites/resources/mediawiki/mediawiki.Title.test.js index d6fe744fc0..a775029709 100644 --- a/tests/qunit/suites/resources/mediawiki/mediawiki.Title.test.js +++ b/tests/qunit/suites/resources/mediawiki/mediawiki.Title.test.js @@ -245,8 +245,8 @@ title = new mw.Title( 'Foo \u00A0\u1680\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u2028\u2029\u202F\u205F\u3000 bar' ); assert.equal( title.getMain(), 'Foo_bar', 'Merge multiple types of whitespace/underscores into a single underscore' ); - title = new mw.Title( 'Foo\u200E\u200F\u202A\u202B\u202C\u202D\u202Ebar' ); - assert.equal( title.getMain(), 'Foobar', 'Strip Unicode bidi override characters' ); + title = new mw.Title( 'Foo\u00AD\u200E\u200F\u202A\u202B\u202C\u202D\u202Ebar' ); + assert.equal( title.getMain(), 'Foobar', 'Strip soft hyphen and Unicode bidi override characters' ); // Regression test: Previously it would only detect an extension if there is no space after it title = new mw.Title( 'Example.js ' ); -- 2.20.1