From b975a0bfe07d5f9d9e0ff6bd5b6f70e6cc4fe678 Mon Sep 17 00:00:00 2001 From: "C. Scott Ananian" Date: Thu, 11 Dec 2014 15:15:28 -0500 Subject: [PATCH] Don't break autolinks by stripping the final semicolon from an entity. Autolinking free external links is clever about making sure that trailing punctuation isn't included in the link. But if an HTML entity happens to terminate the URL, the semicolon from the entity is stripped from the url, breaking it. Fix this corner case. This also unifies autolink parsing with Parsoid. See: I5ae8435322c78dd1df170d7a3543fff3642759b1 Change-Id: I5482782c25e12283030b0fd2150ac55092f7979b --- includes/parser/Parser.php | 15 ++++++++++++++- tests/parser/parserTests.txt | 14 ++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php index a9daa22309..ecb14ed85b 100644 --- a/includes/parser/Parser.php +++ b/includes/parser/Parser.php @@ -1484,7 +1484,20 @@ class Parser { $sep .= ')'; } - $numSepChars = strspn( strrev( $url ), $sep ); + $urlRev = strrev( $url ); + $numSepChars = strspn( $urlRev, $sep ); + # Don't break a trailing HTML entity by moving the ; into $trail + # This is in hot code, so use substr_compare to avoid having to + # create a new string object for the comparison + if ( $numSepChars && substr_compare( $url, ";", -$numSepChars, 1 ) === 0) { + # more optimization: instead of running preg_match with a $ + # anchor, which can be slow, do the match on the reversed + # string starting at the desired offset. + # un-reversed regexp is: /&([a-z]+|#x[\da-f]+|#\d+)$/i + if ( preg_match( '/\G([a-z]+|[\da-f]+x#|\d+#)&/i', $urlRev, $m2, 0, $numSepChars ) ) { + $numSepChars--; + } + } if ( $numSepChars ) { $trail = substr( $url, -$numSepChars ) . $trail; $url = substr( $url, 0, -$numSepChars ); diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt index c7fc380adb..63f6a759ef 100644 --- a/tests/parser/parserTests.txt +++ b/tests/parser/parserTests.txt @@ -4171,6 +4171,13 @@ http://example.com! http://example.com? http://example.com) http://example.com/url_with_(brackets) +(http://example.com/url_without_brackets) +http://example.com/url_with_entity  +http://example.com/url_with_entity  +http://example.com/url_with_entity  +http://example.com/url_with_entity< +http://example.com/url_with_entity< +http://example.com/url_with_entity< !! html

http://example.com, http://example.com; @@ -4181,6 +4188,13 @@ http://example.com/url_with_(brackets) http://example.com? http://example.com) http://example.com/url_with_(brackets) +(http://example.com/url_without_brackets) +http://example.com/url_with_entity  +http://example.com/url_with_entity  +http://example.com/url_with_entity  +http://example.com/url_with_entity< +http://example.com/url_with_entity%3C +http://example.com/url_with_entity%3C

!! end -- 2.20.1