From b468faa362dff6b0ad6dd6046082a890b9580f30 Mon Sep 17 00:00:00 2001 From: Ed Sanders Date: Wed, 21 Jun 2017 21:43:36 -0700 Subject: [PATCH] Sanitizer: Allow attribute names to use any Unicode "Letter" or "Number" Bug: T73386 Change-Id: If712841ba56c5d8f30bbbad500403446a165b07c --- includes/Sanitizer.php | 6 +++--- tests/phpunit/includes/SanitizerTest.php | 4 ++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php index 8920e92f43..01fe60e0d1 100644 --- a/includes/Sanitizer.php +++ b/includes/Sanitizer.php @@ -339,8 +339,8 @@ class Sanitizer { */ static function getAttribsRegex() { if ( self::$attribsRegex === null ) { - $attribFirst = '[:A-Z_a-z0-9]'; - $attrib = '[:A-Z_a-z-.0-9]'; + $attribFirst = "[:_\p{L}\p{N}]"; + $attrib = "[:_\.\-\p{L}\p{N}]"; $space = '[\x09\x0a\x0c\x0d\x20]'; self::$attribsRegex = "/(?:^|$space)({$attribFirst}{$attrib}*) @@ -351,7 +351,7 @@ class Sanitizer { | '([^']*)(?:'|\$) | (((?!$space|>).)*) ) - )?(?=$space|\$)/sx"; + )?(?=$space|\$)/sxu"; } return self::$attribsRegex; } diff --git a/tests/phpunit/includes/SanitizerTest.php b/tests/phpunit/includes/SanitizerTest.php index c237c509a7..abcf1d4283 100644 --- a/tests/phpunit/includes/SanitizerTest.php +++ b/tests/phpunit/includes/SanitizerTest.php @@ -178,6 +178,10 @@ class SanitizerTest extends MediaWikiTestCase { public static function provideTagAttributesToDecode() { return [ [ [ 'foo' => 'bar' ], 'foo=bar', 'Unquoted attribute' ], + [ [ 'עברית' => 'bar' ], 'עברית=bar', 'Non-Latin attribute' ], + [ [ '६' => 'bar' ], '६=bar', 'Devanagari number' ], + [ [ '搭𨋢' => 'bar' ], '搭𨋢=bar', 'Non-BMP character' ], + [ [], 'ńgh=bar', 'Combining accent is not allowed' ], [ [ 'foo' => 'bar' ], ' foo = bar ', 'Spaced attribute' ], [ [ 'foo' => 'bar' ], 'foo="bar"', 'Double-quoted attribute' ], [ [ 'foo' => 'bar' ], 'foo=\'bar\'', 'Single-quoted attribute' ], -- 2.20.1