Add tracking category when editors use the deprecated self-closed tag hack.
author C. Scott Ananian <cscott@cscott.net>
Wed, 4 May 2016 18:01:00 +0000 (14:01 -0400)
committer Tim Starling <tstarling@wikimedia.org>
Tue, 12 Jul 2016 04:18:04 +0000 (14:18 +1000)
Some pages use constructs like `<b/>` or `<span/>` to protect spaces or
special characters at the beginning/end of templates.  This syntax is
incompatible with HTML5 parsing rules, which dictate that these should
be treated as open tags, and instead relies on an unusual quirk of the
`tidy` program that removes invalid constructs.

This syntax is deprecated as part of the process of reconciling `tidy`
with modern HTML5 parsing semantics.  Authors can use `&#32;` or `<nowiki/>`
as valid replacements.

In order to provide time to transition existing content, pages using
self-closing tags in violation of the HTML5 parsing specification
will have their templates/pages added to a new tracking category.
After these uses are fixed, we will change the sanitizer to treat these
as normal open tags, to be consistent with the HTML5 parsing spec.

Note that this construct is already disallowed if tidy is disabled; it
is rendered as `&lt;b/>`.  We add a tracking category in the no-tidy
case as well, in preparation for eventually making the no-tidy and
with-tidy behaviors consistent.

Bug: T134423
Change-Id: Ie1cf3aa40d5483bf395ece539f0240b694ff04ab

includes/Sanitizer.php
includes/parser/Parser.php
includes/specials/SpecialTrackingCategories.php
languages/i18n/en.json
languages/i18n/qqq.json

index d321e9f..c6606e2 100644 (file)
@@ -381,14 +381,17 @@ class Sanitizer {
                                'kbd', 'samp', 'data', 'time', 'mark'
                        ];
                        $htmlsingle = [
-                               'br', 'wbr', 'hr', 'li', 'dt', 'dd'
-                       ];
-                       $htmlsingleonly = [ # Elements that cannot have close tags
-                               'br', 'wbr', 'hr'
+                               'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
                        ];
 
-                       $htmlsingle[] = $htmlsingleonly[] = 'meta';
-                       $htmlsingle[] = $htmlsingleonly[] = 'link';
+                       # Elements that cannot have close tags. This is (not coincidentally)
+                       # also the list of tags for which the HTML 5 parsing algorithm
+                       # requires you to "acknowledge the token's self-closing flag", i.e.
+                       # a self-closing tag like <br/> is not an HTML 5 parse error only
+                       # for this list.
+                       $htmlsingleonly = [
+                               'br', 'wbr', 'hr', 'meta', 'link'
+                       ];
 
                        $htmlnest = [ # Tags that can be nested--??
                                'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
@@ -450,10 +453,14 @@ class Sanitizer {
         * @param array|bool $args Arguments for the processing callback
         * @param array $extratags For any extra tags to include
         * @param array $removetags For any tags (default or extra) to exclude
+        * @param callable $warnCallback (Deprecated) Callback allowing the
+        *   addition of a tracking category when bad input is encountered.
+        *   DO NOT ADD NEW PARAMETERS AFTER $warnCallback, since it will be
+        *   removed shortly.
         * @return string
         */
        public static function removeHTMLtags( $text, $processCallback = null,
-               $args = [], $extratags = [], $removetags = []
+               $args = [], $extratags = [], $removetags = [], $warnCallback = null
        ) {
                extract( self::getRecognizedTagData( $extratags, $removetags ) );
 
@@ -540,6 +547,14 @@ class Sanitizer {
                                                        $badtag = true;
                                                #  Is it a self closed htmlpair ? (bug 5487)
                                                } elseif ( $brace == '/>' && isset( $htmlpairs[$t] ) ) {
+                                                       // Eventually we'll just remove the self-closing
+                                                       // slash, in order to be consistent with HTML5
+                                                       // semantics.
+                                                       // $brace = '>';
+                                                       // For now, let's just warn authors to clean up.
+                                                       if ( is_callable( $warnCallback ) ) {
+                                                               call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] );
+                                                       }
                                                        $badtag = true;
                                                } elseif ( isset( $htmlsingleonly[$t] ) ) {
                                                        # Hack to force empty tag for unclosable elements
@@ -604,12 +619,29 @@ class Sanitizer {
                                                        call_user_func_array( $processCallback, [ &$params, $args ] );
                                                }
 
+                                               if ( $brace == '/>' && !( isset( $htmlsingle[$t] ) || isset( $htmlsingleonly[$t] ) ) ) {
+                                                       // Eventually we'll just remove the self-closing
+                                                       // slash, in order to be consistent with HTML5
+                                                       // semantics.
+                                                       // $brace = '>';
+                                                       // For now, let's just warn authors to clean up.
+                                                       if ( is_callable( $warnCallback ) ) {
+                                                               call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] );
+                                                       }
+                                               }
                                                if ( !Sanitizer::validateTag( $params, $t ) ) {
                                                        $badtag = true;
                                                }
 
                                                $newparams = Sanitizer::fixTagAttributes( $params, $t );
                                                if ( !$badtag ) {
+                                                       if ( $brace === '/>' && !isset( $htmlsingleonly[$t] ) ) {
+                                                               # Interpret self-closing tags as empty tags even when
+                                                               # HTML 5 would interpret them as start tags. Such input
+                                                               # is commonly seen on Wikimedia wikis with this intention.
+                                                               $brace = "></$t>";
+                                                       }
+
                                                        $rest = str_replace( '>', '&gt;', $rest );
                                                        $text .= "<$slash$t$newparams$brace$rest";
                                                        continue;
index bd2f131..19d68c2 100644 (file)
@@ -1264,7 +1264,9 @@ class Parser {
                        $text,
                        [ &$this, 'attributeStripCallback' ],
                        false,
-                       array_keys( $this->mTransparentTagHooks )
+                       array_keys( $this->mTransparentTagHooks ),
+                       [],
+                       [ &$this, 'addTrackingCategory' ]
                );
                Hooks::run( 'InternalParseBeforeLinks', [ &$this, &$text, &$this->mStripState ] );
 
index f2eb88d..4c892b2 100644 (file)
@@ -53,6 +53,7 @@ class SpecialTrackingCategories extends SpecialPage {
                'node-count-exceeded-category',
                'expansion-depth-exceeded-category',
                'restricted-displaytitle-ignored',
+               'deprecated-self-close-category',
        ];
 
        function execute( $par ) {
index 59e1ea4..6d1e58d 100644 (file)
        "content-model-json": "JSON",
        "content-json-empty-object": "Empty object",
        "content-json-empty-array": "Empty array",
+       "deprecated-self-close-category": "Pages using invalid self-closed HTML tags",
+       "deprecated-self-close-category-desc": "The page contains invalid self-closed HTML tags, such as <code>&lt;b/></code> or <code>&lt;span/></code>.  The behavior of these will change soon to be consistent with the HTML5 specification, so their use in wikitext is deprecated.",
        "duplicate-args-warning": "<strong>Warning:</strong> [[:$1]] is calling [[:$2]] with more than one value for the \"$3\" parameter. Only the last value provided will be used.",
        "duplicate-args-category": "Pages using duplicate arguments in template calls",
        "duplicate-args-category-desc": "The page contains template calls that use duplicates of arguments, such as <code><nowiki>{{foo|bar=1|bar=2}}</nowiki></code> or <code><nowiki>{{foo|bar|1=baz}}</nowiki></code>.",
index 1fa921e..c405ee5 100644 (file)
        "content-model-json": "Name for the JSON content model, used when decribing what type of content a page contains.\n\nThis message is substituted in:\n*{{msg-mw|Bad-target-model}}\n*{{msg-mw|Content-not-allowed-here}}",
        "content-json-empty-object": "Used to represent an object with no properties on a JSON content model page.",
        "content-json-empty-array": "Used to represent an array with no values on a JSON content model page.",
+       "deprecated-self-close-category": "This message is used as a category name for a [[mw:Special:MyLanguage/Help:Tracking categories|tracking category]] where pages are placed automatically if they contain invalid self-closed HTML tags, such as <code>&lt;b/></code> or <code>&lt;span/></code>.  The behavior of these will change soon to be consistent with the HTML5 specification, so their use in wikitext is deprecated.",
+       "deprecated-self-close-category-desc": "Invalid self-closed HTML tag category description. Shown on [[Special:TrackingCategories]].\n\nSee also:\n* {{msg-mw|deprecated-self-close-category}}",
        "duplicate-args-warning": "If a page calls a template and specifies the same argument more than once, such as <code><nowiki>{{foo|bar=1|bar=2}}</nowiki></code> or <code><nowiki>{{foo|bar|1=baz}}</nowiki></code>, this warning is displayed when previewing.\n\nParameters:\n* $1 - The calling page\n* $2 - The called template\n* $3 - The name of the duplicated argument",
        "duplicate-args-category": "This message is used as a category name for a [[mw:Special:MyLanguage/Help:Tracking categories|tracking category]] where pages are placed automatically if they contain template calls that use duplicates of arguments, such as <code><nowiki>{{foo|bar=1|bar=2}}</nowiki></code> or <code><nowiki>{{foo|bar|1=baz}}</nowiki></code>.",
        "duplicate-args-category-desc": "Duplicate arguments category description. Shown on [[Special:TrackingCategories]].\n\nSee also:\n* {{msg-mw|Duplicate-args-category}}",