From f87898b4885c2def58cab81e7481a5e96569f288 Mon Sep 17 00:00:00 2001 From: "C. Scott Ananian" Date: Mon, 26 Nov 2018 12:53:51 -0500 Subject: [PATCH] Protect legacy URL parameter syntax in link and alt options HTML doesn't allow certain semicolon-less HTML entities in attribute values to avoid breaking legacy markup like: ... (Note that the & in that URL is not properly entity-escaped as `&`.) Unlike wikitext, HTML generally allows semicolon-less legacy entities in text. Our alt and link option processing shove text through Sanitizer::stripAllTags, which does entity decoding including these legacy semicolon-less entities. Wikitext doesn't allow semicolon-less entities, so escape & characters where appropriate to protect alt/link options and avoid breaking URLs. This was a "regression" in how alt options were handled starting in ddb4913f53624c8ee0a2a91bd44bf750e378569d when we switched to using Remex for Sanitizer::stripAllTags -- semicolon-less entities (previously invalid in wikitext) were now being decoded when stripAllTags was called on alt text. This change became a problem when ad80f0bca27c2b0905b2b137977586bfab80db34 sent link option text through Sanitizer::stripAllTags (with the new semicolon-less entity decode) instead of PHP's strip_tags (which, in addition to its other faults, doesn't do entity decode at all). This suddenly started decoding "non-wikitext" entities like `¶` inside URLs, breaking links. Filed T210437 as a follow-up to consider changing the behavior of Sanitizer::stripAllTags() globally to prevent it from decoding semicolon-less entities for all callers. Bug: T209236 Change-Id: I5925e110e335d83eafa9de935c4e06806322f4a9 --- includes/parser/Parser.php | 34 ++++++++++++++++++++++++++++++++++ tests/parser/parserTests.txt | 31 +++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php index 11825fa443..353afdeed1 100644 --- a/includes/parser/Parser.php +++ b/includes/parser/Parser.php @@ -5524,6 +5524,40 @@ class Parser { # that are later expanded to html- so expand them now and # remove the tags $tooltip = $this->mStripState->unstripBoth( $tooltip ); + # Compatibility hack! In HTML certain entity references not terminated + # by a semicolon are decoded (but not if we're in an attribute; that's + # how link URLs get away without properly escaping & in queries). + # But wikitext has always required semicolon-termination of entities, + # so encode & where needed to avoid decode of semicolon-less entities. + # See T209236 and + # https://www.w3.org/TR/html5/syntax.html#named-character-references + # T210437 discusses moving this workaround to Sanitizer::stripAllTags. + $tooltip = preg_replace( "/ + & # 1. entity prefix + (?= # 2. followed by: + (?: # a. one of the legacy semicolon-less named entities + A(?:Elig|MP|acute|circ|grave|ring|tilde|uml)| + C(?:OPY|cedil)|E(?:TH|acute|circ|grave|uml)| + GT|I(?:acute|circ|grave|uml)|LT|Ntilde| + O(?:acute|circ|grave|slash|tilde|uml)|QUOT|REG|THORN| + U(?:acute|circ|grave|uml)|Yacute| + a(?:acute|c(?:irc|ute)|elig|grave|mp|ring|tilde|uml)|brvbar| + c(?:cedil|edil|urren)|cent(?!erdot;)|copy(?!sr;)|deg| + divide(?!ontimes;)|e(?:acute|circ|grave|th|uml)| + frac(?:1(?:2|4)|34)| + gt(?!c(?:c|ir)|dot|lPar|quest|r(?:a(?:pprox|rr)|dot|eq(?:less|qless)|less|sim);)| + i(?:acute|circ|excl|grave|quest|uml)|laquo| + lt(?!c(?:c|ir)|dot|hree|imes|larr|quest|r(?:Par|i(?:e|f|));)| + m(?:acr|i(?:cro|ddot))|n(?:bsp|tilde)| + not(?!in(?:E|dot|v(?:a|b|c)|)|ni(?:v(?:a|b|c)|);)| + o(?:acute|circ|grave|rd(?:f|m)|slash|tilde|uml)| + p(?:lusmn|ound)|para(?!llel;)|quot|r(?:aquo|eg)| + s(?:ect|hy|up(?:1|2|3)|zlig)|thorn|times(?!b(?:ar|)|d;)| + u(?:acute|circ|grave|ml|uml)|y(?:acute|en|uml) + ) + (?:[^;]|$)) # b. and not followed by a semicolon + # S = study, for efficiency + /Sx", '&', $tooltip ); $tooltip = Sanitizer::stripAllTags( $tooltip ); return $tooltip; diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt index d2fbd8d302..bfa7de9fd3 100644 --- a/tests/parser/parserTests.txt +++ b/tests/parser/parserTests.txt @@ -15476,6 +15476,37 @@ File:Foobar.jpg|link=Foo''s_bar''s|caption !! end +!! test +HTML entity prefix in link markup (T209236) +!! wikitext +[[File:Foobar.jpg|link=https://example.com?foo¶ms=bar]] + + + +File:Foobar.jpg|link=https://example.com?foo¶ms=bar + +!! html/php+tidy +

Foobar.jpg +

+ +!! html/parsoid +

+ + + +!! end + !! test Image with table with attributes in caption !! options -- 2.20.1