From 6cdae8051370b7e6641eaaeb438b889ddb510281 Mon Sep 17 00:00:00 2001 From: "C. Scott Ananian" Date: Wed, 4 May 2016 14:01:00 -0400 Subject: [PATCH] Add tracking category when editors use the deprecated self-closed tag hack. Some pages use constructs like `<b/>` or `<span/>` to protect spaces or special characters at the beginning/end of templates. This syntax is incompatible with HTML5 parsing rules, which dictate that these should be treated as open tags, and instead relies on an unusual quirk of the `tidy` program that removes invalid constructs. This syntax is deprecated as part of the process of reconciling `tidy` with modern HTML5 parsing semantics. Authors can use `&#32;` or `<nowiki/>` as valid replacements. In order to provide time to transition existing content, pages using self-closing tags in violation of the HTML5 parsing specification will have their templates/pages added to a new tracking category. After these uses are fixed, we will change the sanitizer to treat these as normal open tags, to be consistent with the HTML5 parsing spec. Note that this construct is already disallowed if tidy is disabled; it is rendered as `&lt;b/&gt;`. We add a tracking category in the no-tidy case as well, in preparation for eventually making the no-tidy and with-tidy behaviors consistent.
Bug: T134423 Change-Id: Ie1cf3aa40d5483bf395ece539f0240b694ff04ab --- includes/Sanitizer.php | 46 ++++++++++++++++--- includes/parser/Parser.php | 4 +- .../specials/SpecialTrackingCategories.php | 1 + languages/i18n/en.json | 2 + languages/i18n/qqq.json | 2 + 5 files changed, 47 insertions(+), 8 deletions(-) diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php index d321e9f0c9..c6606e2995 100644 --- a/includes/Sanitizer.php +++ b/includes/Sanitizer.php @@ -381,14 +381,17 @@ class Sanitizer { 'kbd', 'samp', 'data', 'time', 'mark' ]; $htmlsingle = [ - 'br', 'wbr', 'hr', 'li', 'dt', 'dd' - ]; - $htmlsingleonly = [ # Elements that cannot have close tags - 'br', 'wbr', 'hr' + 'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link' ]; - $htmlsingle[] = $htmlsingleonly[] = 'meta'; - $htmlsingle[] = $htmlsingleonly[] = 'link'; + # Elements that cannot have close tags. This is (not coincidentally) + # also the list of tags for which the HTML 5 parsing algorithm + # requires you to "acknowledge the token's self-closing flag", i.e. + # a self-closing tag like
is not an HTML 5 parse error only + # for this list. + $htmlsingleonly = [ + 'br', 'wbr', 'hr', 'meta', 'link' + ]; $htmlnest = [ # Tags that can be nested--?? 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', @@ -450,10 +453,14 @@ class Sanitizer { * @param array|bool $args Arguments for the processing callback * @param array $extratags For any extra tags to include * @param array $removetags For any tags (default or extra) to exclude + * @param callable $warnCallback (Deprecated) Callback allowing the + * addition of a tracking category when bad input is encountered. + * DO NOT ADD NEW PARAMETERS AFTER $warnCallback, since it will be + * removed shortly. * @return string */ public static function removeHTMLtags( $text, $processCallback = null, - $args = [], $extratags = [], $removetags = [] + $args = [], $extratags = [], $removetags = [], $warnCallback = null ) { extract( self::getRecognizedTagData( $extratags, $removetags ) ); @@ -540,6 +547,14 @@ class Sanitizer { $badtag = true; #  Is it a self closed htmlpair ? (bug 5487) } elseif ( $brace == '/>' && isset( $htmlpairs[$t] ) ) { + // Eventually we'll just remove the self-closing + // slash, in order to be consistent with HTML5 + // semantics. + // $brace = '>'; + // For now, let's just warn authors to clean up. + if ( is_callable( $warnCallback ) ) { + call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] ); + } $badtag = true; } elseif ( isset( $htmlsingleonly[$t] ) ) { # Hack to force empty tag for unclosable elements @@ -604,12 +619,29 @@ class Sanitizer { call_user_func_array( $processCallback, [ &$params, $args ] ); } + if ( $brace == '/>' && !( isset( $htmlsingle[$t] ) || isset( $htmlsingleonly[$t] ) ) ) { + // Eventually we'll just remove the self-closing + // slash, in order to be consistent with HTML5 + // semantics. + // $brace = '>'; + // For now, let's just warn authors to clean up. 
+ if ( is_callable( $warnCallback ) ) { + call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] ); + } + } if ( !Sanitizer::validateTag( $params, $t ) ) { $badtag = true; } $newparams = Sanitizer::fixTagAttributes( $params, $t ); if ( !$badtag ) { + if ( $brace === '/>' && !isset( $htmlsingleonly[$t] ) ) { + # Interpret self-closing tags as empty tags even when + # HTML 5 would interpret them as start tags. Such input + # is commonly seen on Wikimedia wikis with this intention. + $brace = ">"; + } + $rest = str_replace( '>', '>', $rest ); $text .= "<$slash$t$newparams$brace$rest"; continue; diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php index bd2f13181e..19d68c274f 100644 --- a/includes/parser/Parser.php +++ b/includes/parser/Parser.php @@ -1264,7 +1264,9 @@ class Parser { $text, [ &$this, 'attributeStripCallback' ], false, - array_keys( $this->mTransparentTagHooks ) + array_keys( $this->mTransparentTagHooks ), + [], + [ &$this, 'addTrackingCategory' ] ); Hooks::run( 'InternalParseBeforeLinks', [ &$this, &$text, &$this->mStripState ] ); diff --git a/includes/specials/SpecialTrackingCategories.php b/includes/specials/SpecialTrackingCategories.php index f2eb88d154..4c892b29ee 100644 --- a/includes/specials/SpecialTrackingCategories.php +++ b/includes/specials/SpecialTrackingCategories.php @@ -53,6 +53,7 @@ class SpecialTrackingCategories extends SpecialPage { 'node-count-exceeded-category', 'expansion-depth-exceeded-category', 'restricted-displaytitle-ignored', + 'deprecated-self-close-category', ]; function execute( $par ) { diff --git a/languages/i18n/en.json b/languages/i18n/en.json index 59e1ea498b..6d1e58d4c5 100644 --- a/languages/i18n/en.json +++ b/languages/i18n/en.json @@ -784,6 +784,8 @@ "content-model-json": "JSON", "content-json-empty-object": "Empty object", "content-json-empty-array": "Empty array", + "deprecated-self-close-category": "Pages using invalid self-closed HTML tags", + 
"deprecated-self-close-category-desc": "The page contains invalid self-closed HTML tags, such as <b/> or <span/>. The behavior of these will change soon to be consistent with the HTML5 specification, so their use in wikitext is deprecated.", "duplicate-args-warning": "Warning: [[:$1]] is calling [[:$2]] with more than one value for the \"$3\" parameter. Only the last value provided will be used.", "duplicate-args-category": "Pages using duplicate arguments in template calls", "duplicate-args-category-desc": "The page contains template calls that use duplicates of arguments, such as {{foo|bar=1|bar=2}} or {{foo|bar|1=baz}}.", diff --git a/languages/i18n/qqq.json b/languages/i18n/qqq.json index 1fa921eb50..c405ee5815 100644 --- a/languages/i18n/qqq.json +++ b/languages/i18n/qqq.json @@ -965,6 +965,8 @@ "content-model-json": "Name for the JSON content model, used when decribing what type of content a page contains.\n\nThis message is substituted in:\n*{{msg-mw|Bad-target-model}}\n*{{msg-mw|Content-not-allowed-here}}", "content-json-empty-object": "Used to represent an object with no properties on a JSON content model page.", "content-json-empty-array": "Used to represent an array with no values on a JSON content model page.", + "deprecated-self-close-category": "This message is used as a category name for a [[mw:Special:MyLanguage/Help:Tracking categories|tracking category]] where pages are placed automatically if they contain invalid self-closed HTML tags, such as <b/> or <span/>. The behavior of these will change soon to be consistent with the HTML5 specification, so their use in wikitext is deprecated.", + "deprecated-self-close-category-desc": "Invalid self-closed HTML tag category description. 
Shown on [[Special:TrackingCategories]].\n\nSee also:\n* {{msg-mw|deprecated-self-close-category}}", "duplicate-args-warning": "If a page calls a template and specifies the same argument more than once, such as {{foo|bar=1|bar=2}} or {{foo|bar|1=baz}}, this warning is displayed when previewing.\n\nParameters:\n* $1 - The calling page\n* $2 - The called template\n* $3 - The name of the duplicated argument", "duplicate-args-category": "This message is used as a category name for a [[mw:Special:MyLanguage/Help:Tracking categories|tracking category]] where pages are placed automatically if they contain template calls that use duplicates of arguments, such as {{foo|bar=1|bar=2}} or {{foo|bar|1=baz}}.", "duplicate-args-category-desc": "Duplicate arguments category description. Shown on [[Special:TrackingCategories]].\n\nSee also:\n* {{msg-mw|Duplicate-args-category}}", -- 2.20.1