From fd6e9ef2d481209b01fa6e1bb1c863b8257f0272 Mon Sep 17 00:00:00 2001 From: Max Semenik Date: Thu, 29 Jun 2017 17:13:12 -0700 Subject: [PATCH] Human-readable section ID support It adds the ability to replace the current section ID escaping schema (.C0.DE) with a HTML5-compliant escaping schema that is displayed as Unicode in many modern browsers. See the linked bug for discussion of various options that were considered before the implementation. A few remarks: * Because Sanitizer::escapeId() is used in a bunch of places without escaping, I'm deprecating it without altering its behavior. * The bug described in comments for Parser::guessLegacySectionNameFromWikiText() is still there in some Edge versions that display mojibake. Bug: T152540 Change-Id: Id304010a0342efbb7ef2d56c5b8b244f2e4fb2c5 --- RELEASE-NOTES-1.30 | 15 ++ autoload.php | 1 + includes/DefaultSettings.php | 52 +++++- includes/EditPage.php | 31 +++- includes/Linker.php | 14 +- includes/Sanitizer.php | 162 ++++++++++++++++-- includes/Setup.php | 5 + includes/Title.php | 8 +- includes/actions/InfoAction.php | 4 +- includes/api/ApiMain.php | 33 ++-- includes/htmlform/HTMLForm.php | 4 +- includes/htmlform/HTMLFormField.php | 6 +- includes/htmlform/OOUIHTMLForm.php | 2 +- .../htmlform/fields/HTMLFormFieldCloner.php | 8 +- includes/htmlform/fields/HTMLRadioField.php | 2 +- includes/page/ImagePage.php | 11 +- includes/parser/Parser.php | 76 ++++---- .../ResourceLoaderMediaWikiUtilModule.php | 46 +++++ includes/skins/BaseTemplate.php | 6 +- includes/skins/Skin.php | 4 +- includes/specials/SpecialListgrants.php | 2 +- includes/specials/SpecialListgrouprights.php | 2 +- includes/specials/SpecialVersion.php | 2 +- .../specials/pagers/AllMessagesTablePager.php | 4 +- resources/Resources.php | 1 + .../mediawiki.action.edit.preview.js | 2 +- resources/src/mediawiki/mediawiki.util.js | 101 +++++++++-- tests/parser/ParserTestRunner.php | 2 + tests/parser/parserTests.txt | 141 +++++++++++++++ 
tests/phpunit/includes/SanitizerTest.php | 123 ++++++++++++- .../mediawiki/mediawiki.util.test.js | 71 +++++++- 31 files changed, 811 insertions(+), 130 deletions(-) create mode 100644 includes/resourceloader/ResourceLoaderMediaWikiUtilModule.php diff --git a/RELEASE-NOTES-1.30 b/RELEASE-NOTES-1.30 index c5ab81afb9..452cb35b2c 100644 --- a/RELEASE-NOTES-1.30 +++ b/RELEASE-NOTES-1.30 @@ -26,6 +26,13 @@ section). array. This allows dependency injection to be used for ResourceLoader modules. * $wgExceptionHooks has been removed. * (T45547) $wgUsePigLatinVariant added (off by default). +* (T152540) MediaWiki now supports a section ID escaping style that allows to display + non-Latin characters verbatim on many modern browsers. This is controlled by the + new configuration setting, $wgFragmentMode. +* $wgExperimentalHtmlIds is now deprecated and will be removed in a future version, + use $wgFragmentMode to migrate off it to a modern alternative. +* $wgExternalInterwikiFragmentMode was introduced to control how fragments in + interwikis going outside of current wiki farm are encoded. === New features in 1.30 === * (T37247) Output from Parser::parse() will now be wrapped in a div with @@ -143,6 +150,14 @@ changes to languages because of Phabricator reports. MediaWikiServices instead. Access to the underlying BagOStuff is possible through the new ParserCache::getCacheStorage() method. * .mw-ui-constructive CSS class (deprecated in 1.27) was removed. +* Sanitizer::escapeId() was deprecated, use escapeIdForAttribute(), + escapeIdForLink() or escapeIdForExternalInterwiki() instead. +* Title::escapeFragmentForURL() was deprecated, use one of the aforementioned + Sanitizer functions or, if possible, Title::getFragmentForURL(). +* Second parameter to Sanitizer::escapeIdReferenceList() ($options) now does + nothing and is deprecated. +* mw.util.escapeId() was deprecated, use escapeIdForAttribute() or + escapeIdForLink(). 
== Compatibility == MediaWiki 1.30 requires PHP 5.5.9 or later. There is experimental support for diff --git a/autoload.php b/autoload.php index 2bf1d4cc5d..d44a30556b 100644 --- a/autoload.php +++ b/autoload.php @@ -1231,6 +1231,7 @@ $wgAutoloadLocalClasses = [ 'ResourceLoaderJqueryMsgModule' => __DIR__ . '/includes/resourceloader/ResourceLoaderJqueryMsgModule.php', 'ResourceLoaderLanguageDataModule' => __DIR__ . '/includes/resourceloader/ResourceLoaderLanguageDataModule.php', 'ResourceLoaderLanguageNamesModule' => __DIR__ . '/includes/resourceloader/ResourceLoaderLanguageNamesModule.php', + 'ResourceLoaderMediaWikiUtilModule' => __DIR__ . '/includes/resourceloader/ResourceLoaderMediaWikiUtilModule.php', 'ResourceLoaderModule' => __DIR__ . '/includes/resourceloader/ResourceLoaderModule.php', 'ResourceLoaderOOUIFileModule' => __DIR__ . '/includes/resourceloader/ResourceLoaderOOUIFileModule.php', 'ResourceLoaderOOUIImageModule' => __DIR__ . '/includes/resourceloader/ResourceLoaderOOUIImageModule.php', diff --git a/includes/DefaultSettings.php b/includes/DefaultSettings.php index b6d75ce734..8e38121b1c 100644 --- a/includes/DefaultSettings.php +++ b/includes/DefaultSettings.php @@ -3372,16 +3372,56 @@ $wgApiFrameOptions = 'DENY'; $wgDisableOutputCompression = false; /** - * Should we allow a broader set of characters in id attributes, per HTML5? If - * not, use only HTML 4-compatible IDs. This option is for testing -- when the - * functionality is ready, it will be on by default with no option. + * Abandoned experiment with HTML5-style ID escaping. Normalized IDs a bit + * too aggressively, breaking preexisting content (particularly Cite). + * See T29733, T29694, T29474. * - * Currently this appears to work fine in all browsers, but it's disabled by - * default because it normalizes id's a bit too aggressively, breaking preexisting - * content (particularly Cite). See T29733, T29694, T29474. 
+ * @deprecated since 1.30, use $wgFragmentMode */ $wgExperimentalHtmlIds = false; +/** + * How should section IDs be encoded? + * This array can contain 1 or 2 elements, each of them can be one of: + * - 'html5' is modern HTML5 style encoding with minimal escaping. Allows to + * display Unicode characters in many browsers' address bars. + * - 'legacy' is old MediaWiki-style encoding, e.g. 啤酒 turns into .E5.95.A4.E9.85.92 + * - 'html5-legacy' corresponds to DEPRECATED $wgExperimentalHtmlIds mode. DO NOT use + * it for anything but migration off that mode (see below). + * + * The first element of this array specifies the primary mode of escaping IDs. This + * is what users will see when they e.g. follow an [[#internal link]] to a section of + * a page. + * + * The optional second element defines a fallback mode, useful for migrations. + * If present, it will direct MediaWiki to add empty <span>s to every section with its + * id attribute set to fallback encoded title so that links using the previous encoding + * would still work. + * + * Example: you want to migrate your wiki from 'legacy' to 'html5' + * + * On the first step, set this variable to [ 'legacy', 'html5' ]. After a while, when + * all caches (parser, HTTP, etc.) contain only pages generated with this setting, + * flip the value to [ 'html5', 'legacy' ]. This will result in all internal links being + * generated in the new encoding while old links (both external and cached internal) will + * still work. After a long time, you might want to ditch backwards compatibility and + * set it to [ 'html5' ]. After all, pages get edited, breaking incoming links no matter which + * fragment mode is used. + * + * @since 1.30 + */ +$wgFragmentMode = [ 'legacy' ]; + +/** + * Which ID escaping mode should be used for external interwiki links? See documentation + * for $wgFragmentMode above for details of each mode. 
Because you can't control external sites, + * this setting should probably always be 'legacy', unless every wiki you link to has converted + * to 'html5'. + * + * @since 1.30 + */ +$wgExternalInterwikiFragmentMode = 'legacy'; + /** * Abstract list of footer icons for skins in place of old copyrightico and poweredbyico code * You can add new icons to the built in copyright or poweredby, or you can create diff --git a/includes/EditPage.php b/includes/EditPage.php index fc770068f5..9d83fbd91d 100644 --- a/includes/EditPage.php +++ b/includes/EditPage.php @@ -1698,7 +1698,7 @@ class EditPage { global $wgParser; if ( $this->sectiontitle !== '' ) { - $sectionanchor = $wgParser->guessLegacySectionNameFromWikiText( $this->sectiontitle ); + $sectionanchor = $this->guessSectionName( $this->sectiontitle ); // If no edit summary was specified, create one automatically from the section // title and have it link to the new section. Otherwise, respect the summary as // passed. @@ -1708,7 +1708,7 @@ class EditPage { ->rawParams( $cleanSectionTitle )->inContentLanguage()->text(); } } elseif ( $this->summary !== '' ) { - $sectionanchor = $wgParser->guessLegacySectionNameFromWikiText( $this->summary ); + $sectionanchor = $this->guessSectionName( $this->summary ); # This is a new section, so create a link to the new section # in the revision summary. $cleanSummary = $wgParser->stripSectionName( $this->summary ); @@ -1743,7 +1743,7 @@ class EditPage { * time. */ public function internalAttemptSave( &$result, $bot = false ) { - global $wgUser, $wgRequest, $wgParser, $wgMaxArticleSize; + global $wgUser, $wgRequest, $wgMaxArticleSize; global $wgContentHandlerUseDB; $status = Status::newGood(); @@ -2117,7 +2117,7 @@ class EditPage { # We can't deal with anchors, includes, html etc in the header for now, # headline would need to be parsed to improve this. 
if ( $hasmatch && strlen( $matches[2] ) > 0 ) { - $sectionanchor = $wgParser->guessLegacySectionNameFromWikiText( $matches[2] ); + $sectionanchor = $this->guessSectionName( $matches[2] ); } } $result['sectionanchor'] = $sectionanchor; @@ -4795,4 +4795,27 @@ HTML } return $wikitext; } + + /** + * Turns section name wikitext into anchors for use in HTTP redirects. Various + * versions of Microsoft browsers misinterpret fragment encoding of Location: headers + * resulting in mojibake in address bar. Redirect them to legacy section IDs, + * if possible. All the other browsers get HTML5 if the wiki is configured for it, to + * spread the new style links more efficiently. + * + * @param string $text + * @return string + */ + private function guessSectionName( $text ) { + global $wgParser; + + // Detect Microsoft browsers + $userAgent = $this->context->getRequest()->getHeader( 'User-Agent' ); + if ( $userAgent && preg_match( '/MSIE|Edge/', $userAgent ) ) { + // ...and redirect them to legacy encoding, if available + return $wgParser->guessLegacySectionNameFromWikiText( $text ); + } + // Meanwhile, real browsers get real anchors + return $wgParser->guessSectionNameFromWikiText( $text ); + } } diff --git a/includes/Linker.php b/includes/Linker.php index 4aae3ba628..2ca851c13f 100644 --- a/includes/Linker.php +++ b/includes/Linker.php @@ -1608,22 +1608,24 @@ class Linker { * a space and ending with '>' * This *must* be at least '>' for no attribs * @param string $anchor The anchor to give the headline (the bit after the #) - * @param string $html Html for the text of the header + * @param string $html HTML for the text of the header * @param string $link HTML to add for the section edit link - * @param bool|string $legacyAnchor A second, optional anchor to give for + * @param string|bool $fallbackAnchor A second, optional anchor to give for * backward compatibility (false to omit) * * @return string HTML headline */ public static function makeHeadline( $level, $attribs, 
$anchor, $html, - $link, $legacyAnchor = false + $link, $fallbackAnchor = false ) { + $anchorEscaped = htmlspecialchars( $anchor ); $ret = "$html" + . "$html" . $link . ""; - if ( $legacyAnchor !== false ) { - $ret = "
$ret"; + if ( $fallbackAnchor !== false && $fallbackAnchor !== $anchor ) { + $fallbackAnchor = htmlspecialchars( $fallbackAnchor ); + $ret = "
$ret"; } return $ret; } diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php index 2def06a9dd..907da16054 100644 --- a/includes/Sanitizer.php +++ b/includes/Sanitizer.php @@ -56,6 +56,21 @@ class Sanitizer { const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i'; const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/"; + /** + * Tells escapeIdForAttribute() to encode the ID using the wiki's primary encoding. + * + * @since 1.30 + */ + const ID_PRIMARY = 0; + + /** + * Tells escapeIdForAttribute() to encode the ID using the fallback encoding, or return false + * if no fallback is configured. + * + * @since 1.30 + */ + const ID_FALLBACK = 1; + /** * List of all named character entities defined in HTML 4.01 * https://www.w3.org/TR/html4/sgml/entities.html @@ -800,7 +815,7 @@ class Sanitizer { # Escape HTML id attributes if ( $attribute === 'id' ) { - $value = self::escapeId( $value, 'noninitial' ); + $value = self::escapeIdForAttribute( $value, Sanitizer::ID_PRIMARY ); } # Escape HTML id reference lists @@ -1164,6 +1179,8 @@ class Sanitizer { * ambiguous if it's part of something that looks like a percent escape * (which don't work reliably in fragments cross-browser). * + * @deprecated since 1.30, use one of this class' escapeIdFor*() functions + * * @see https://www.w3.org/TR/html401/types.html#type-name Valid characters * in the id and name attributes * @see https://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with @@ -1215,21 +1232,146 @@ class Sanitizer { return $id; } + /** + * Given a section name or other user-generated or otherwise unsafe string, escapes it to be + * a valid HTML id attribute. + * + * WARNING: unlike escapeId(), the output of this function is not guaranteed to be HTML safe, + * be sure to use proper escaping. + * + * @param string $id String to escape + * @param int $mode One of ID_* constants, specifying whether the primary or fallback encoding + * should be used. 
+ * @return string|bool Escaped ID or false if fallback encoding is requested but it's not + * configured. + * + * @since 1.30 + */ + public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) { + global $wgFragmentMode; + + if ( !isset( $wgFragmentMode[$mode] ) ) { + if ( $mode === self::ID_PRIMARY ) { + throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' ); + } + return false; + } + + $internalMode = $wgFragmentMode[$mode]; + + return self::escapeIdInternal( $id, $internalMode ); + } + + /** + * Given a section name or other user-generated or otherwise unsafe string, escapes it to be + * a valid URL fragment. + * + * WARNING: unlike escapeId(), the output of this function is not guaranteed to be HTML safe, + * be sure to use proper escaping. + * + * @param string $id String to escape + * @return string Escaped ID + * + * @since 1.30 + */ + public static function escapeIdForLink( $id ) { + global $wgFragmentMode; + + if ( !isset( $wgFragmentMode[self::ID_PRIMARY] ) ) { + throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' ); + } + + $mode = $wgFragmentMode[self::ID_PRIMARY]; + + $id = self::escapeIdInternal( $id, $mode ); + $id = self::urlEscapeId( $id, $mode ); + + return $id; + } + + /** + * Given a section name or other user-generated or otherwise unsafe string, escapes it to be + * a valid URL fragment for external interwikis. + * + * @param string $id String to escape + * @return string Escaped ID + * + * @since 1.30 + */ + public static function escapeIdForExternalInterwiki( $id ) { + global $wgExternalInterwikiFragmentMode; + + $id = self::escapeIdInternal( $id, $wgExternalInterwikiFragmentMode ); + $id = self::urlEscapeId( $id, $wgExternalInterwikiFragmentMode ); + + return $id; + } + + /** + * Helper for escapeIdFor*() functions. URL-escapes the ID if needed. 
+ * + * @param string $id String to escape + * @param string $mode One of modes from $wgFragmentMode + * @return string + */ + private static function urlEscapeId( $id, $mode ) { + if ( $mode === 'html5' ) { + $id = urlencode( $id ); + $id = str_replace( '%3A', ':', $id ); + } + + return $id; + } + + /** + * Helper for escapeIdFor*() functions. Performs most of the actual escaping. + * + * @param string $id String to escape + * @param string $mode One of modes from $wgFragmentMode + * @return string + */ + private static function escapeIdInternal( $id, $mode ) { + $id = Sanitizer::decodeCharReferences( $id ); + + switch ( $mode ) { + case 'html5': + $id = str_replace( ' ', '_', $id ); + break; + case 'legacy': + // This corresponds to 'noninitial' mode of the old escapeId() + static $replace = [ + '%3A' => ':', + '%' => '.' + ]; + + $id = urlencode( str_replace( ' ', '_', $id ) ); + $id = strtr( $id, $replace ); + break; + case 'html5-legacy': + $id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id ); + $id = trim( $id, '_' ); + if ( $id === '' ) { + // Must have been all whitespace to start with. + $id = '_'; + } + break; + default: + throw new InvalidArgumentException( "Invalid mode '$mode' passed to '" . __METHOD__ ); + } + + return $id; + } + /** * Given a string containing a space delimited list of ids, escape each id * to match ids escaped by the escapeId() function. * + * @todo wfDeprecated() uses of $options in 1.31, remove completely in 1.32 + * * @since 1.27 * * @param string $referenceString Space delimited list of ids - * @param string|array $options String or array of strings (default is array()): - * 'noninitial': This is a non-initial fragment of an id, not a full id, - * so don't pay attention if the first character isn't valid at the - * beginning of an id. Only matters if $wgExperimentalHtmlIds is - * false. 
- * 'legacy': Behave the way the old HTML 4-based ID escaping worked even - * if $wgExperimentalHtmlIds is used, so we can generate extra - * anchors and links won't break. + * @param string|array $options Deprecated and does nothing. * @return string */ static function escapeIdReferenceList( $referenceString, $options = [] ) { @@ -1238,7 +1380,7 @@ class Sanitizer { # Escape each token as an id foreach ( $references as &$ref ) { - $ref = self::escapeId( $ref, $options ); + $ref = self::escapeIdForAttribute( $ref ); } # Merge the array back to a space delimited list string diff --git a/includes/Setup.php b/includes/Setup.php index 3d5bee2a08..68e3d96afe 100644 --- a/includes/Setup.php +++ b/includes/Setup.php @@ -282,6 +282,11 @@ foreach ( $wgForeignFileRepos as &$repo ) { } unset( $repo ); // no global pollution; destroy reference +// Convert this deprecated setting to modern system +if ( $wgExperimentalHtmlIds ) { + $wgFragmentMode = [ 'html5-legacy', 'legacy' ]; +} + $rcMaxAgeDays = $wgRCMaxAge / ( 3600 * 24 ); if ( $wgRCFilterByAge ) { // Trim down $wgRCLinkDays so that it only lists links which are valid diff --git a/includes/Title.php b/includes/Title.php index 0a2f86810b..7b3e3a5b68 100644 --- a/includes/Title.php +++ b/includes/Title.php @@ -748,6 +748,8 @@ class Title implements LinkTarget { /** * Escape a text fragment, say from a link, for a URL * + * @deprecated since 1.30, use Sanitizer::escapeIdForLink() or escapeIdForExternalInterwiki() + * * @param string $fragment Containing a URL or link fragment (after the "#") * @return string Escaped string */ @@ -1397,14 +1399,16 @@ class Title implements LinkTarget { /** * Get the fragment in URL form, including the "#" character if there is one + * * @return string Fragment in URL form */ public function getFragmentForURL() { if ( !$this->hasFragment() ) { return ''; - } else { - return '#' . 
self::escapeFragmentForURL( $this->getFragment() ); + } elseif ( $this->isExternal() && !$this->getTransWikiID() ) { + return '#' . Sanitizer::escapeIdForExternalInterwiki( $this->getFragment() ); } + return '#' . Sanitizer::escapeIdForLink( $this->getFragment() ); } /** diff --git a/includes/actions/InfoAction.php b/includes/actions/InfoAction.php index baec944e67..68dda37b6d 100644 --- a/includes/actions/InfoAction.php +++ b/includes/actions/InfoAction.php @@ -156,8 +156,8 @@ class InfoAction extends FormlessAction { * @return string The HTML. */ protected function makeHeader( $header, $canonicalId ) { - $spanAttribs = [ 'class' => 'mw-headline', 'id' => Sanitizer::escapeId( $header ) ]; - $h2Attribs = [ 'id' => Sanitizer::escapeId( $canonicalId ) ]; + $spanAttribs = [ 'class' => 'mw-headline', 'id' => Sanitizer::escapeIdForAttribute( $header ) ]; + $h2Attribs = [ 'id' => Sanitizer::escapeIdForAttribute( $canonicalId ) ]; return Html::rawElement( 'h2', $h2Attribs, Html::element( 'span', $spanAttribs, $header ) ); } diff --git a/includes/api/ApiMain.php b/includes/api/ApiMain.php index b7d452940d..646823573d 100644 --- a/includes/api/ApiMain.php +++ b/includes/api/ApiMain.php @@ -1931,14 +1931,15 @@ class ApiMain extends ApiBase { $header = $this->msg( 'api-help-datatypes-header' )->parse(); - // Add an additional span with sanitized ID - if ( !$this->getConfig()->get( 'ExperimentalHtmlIds' ) ) { - $header = Html::element( 'span', [ 'id' => Sanitizer::escapeId( 'main/datatypes' ) ] ) . - $header; - } - $help['datatypes'] .= Html::rawElement( 'h' . 
min( 6, $level ), - [ 'id' => 'main/datatypes', 'class' => 'apihelp-header' ], - $header + $id = Sanitizer::escapeIdForAttribute( 'main/datatypes', Sanitizer::ID_PRIMARY ); + $idFallback = Sanitizer::escapeIdForAttribute( 'main/datatypes', Sanitizer::ID_FALLBACK ); + + $help['datatypes'] .= Linker::makeHeadline( min( 6, $level ), + ' class="apihelp-header"', + $id, + $header, + '', + $idFallback ); $help['datatypes'] .= $this->msg( 'api-help-datatypes' )->parseAsBlock(); if ( !isset( $tocData['main/datatypes'] ) ) { @@ -1953,15 +1954,15 @@ class ApiMain extends ApiBase { ]; } - // Add an additional span with sanitized ID - if ( !$this->getConfig()->get( 'ExperimentalHtmlIds' ) ) { - $header = Html::element( 'span', [ 'id' => Sanitizer::escapeId( 'main/credits' ) ] ) . - $header; - } $header = $this->msg( 'api-credits-header' )->parse(); - $help['credits'] .= Html::rawElement( 'h' . min( 6, $level ), - [ 'id' => 'main/credits', 'class' => 'apihelp-header' ], - $header + $id = Sanitizer::escapeIdForAttribute( 'main/credits', Sanitizer::ID_PRIMARY ); + $idFallback = Sanitizer::escapeIdForAttribute( 'main/credits', Sanitizer::ID_FALLBACK ); + $help['credits'] .= Linker::makeHeadline( min( 6, $level ), + ' class="apihelp-header"', + $id, + $header, + '', + $idFallback ); $help['credits'] .= $this->msg( 'api-credits' )->useDatabase( false )->parseAsBlock(); if ( !isset( $tocData['main/credits'] ) ) { diff --git a/includes/htmlform/HTMLForm.php b/includes/htmlform/HTMLForm.php index d4351e0304..702c2ebc7a 100644 --- a/includes/htmlform/HTMLForm.php +++ b/includes/htmlform/HTMLForm.php @@ -1692,7 +1692,7 @@ class HTMLForm extends ContextSource { $attributes = []; if ( $fieldsetIDPrefix ) { - $attributes['id'] = Sanitizer::escapeId( "$fieldsetIDPrefix$key" ); + $attributes['id'] = Sanitizer::escapeIdForAttribute( "$fieldsetIDPrefix$key" ); } $subsectionHtml .= $this->wrapFieldSetSection( $legend, $section, $attributes ); } else { @@ -1741,7 +1741,7 @@ class HTMLForm extends 
ContextSource { ]; if ( $sectionName ) { - $attribs['id'] = Sanitizer::escapeId( $sectionName ); + $attribs['id'] = Sanitizer::escapeIdForAttribute( $sectionName ); } if ( $displayFormat === 'table' ) { diff --git a/includes/htmlform/HTMLFormField.php b/includes/htmlform/HTMLFormField.php index 7cb83e21cd..77ddc1a1b5 100644 --- a/includes/htmlform/HTMLFormField.php +++ b/includes/htmlform/HTMLFormField.php @@ -416,8 +416,8 @@ abstract class HTMLFormField { $this->mDir = $params['dir']; } - $validName = Sanitizer::escapeId( $this->mName ); - $validName = str_replace( [ '.5B', '.5D' ], [ '[', ']' ], $validName ); + $validName = urlencode( $this->mName ); + $validName = str_replace( [ '%5B', '%5D' ], [ '[', ']' ], $validName ); if ( $this->mName != $validName && !isset( $params['nodata'] ) ) { throw new MWException( "Invalid name '{$this->mName}' passed to " . __METHOD__ ); } @@ -430,7 +430,7 @@ abstract class HTMLFormField { if ( isset( $params['id'] ) ) { $id = $params['id']; - $validId = Sanitizer::escapeId( $id ); + $validId = urlencode( $id ); if ( $id != $validId ) { throw new MWException( "Invalid id '$id' passed to " . 
__METHOD__ ); diff --git a/includes/htmlform/OOUIHTMLForm.php b/includes/htmlform/OOUIHTMLForm.php index 9dd37b31ee..e47de61a4c 100644 --- a/includes/htmlform/OOUIHTMLForm.php +++ b/includes/htmlform/OOUIHTMLForm.php @@ -180,7 +180,7 @@ class OOUIHTMLForm extends HTMLForm { 'items' => $fieldsHtml, ]; if ( $sectionName ) { - $config['id'] = Sanitizer::escapeId( $sectionName ); + $config['id'] = Sanitizer::escapeIdForAttribute( $sectionName ); } if ( is_string( $this->mWrapperLegend ) ) { $config['label'] = $this->mWrapperLegend; diff --git a/includes/htmlform/fields/HTMLFormFieldCloner.php b/includes/htmlform/fields/HTMLFormFieldCloner.php index dd9184bf33..53c6835949 100644 --- a/includes/htmlform/fields/HTMLFormFieldCloner.php +++ b/includes/htmlform/fields/HTMLFormFieldCloner.php @@ -93,9 +93,9 @@ class HTMLFormFieldCloner extends HTMLFormField { $info['name'] = $name; } if ( isset( $info['id'] ) ) { - $info['id'] = Sanitizer::escapeId( "{$this->mID}--$key--{$info['id']}" ); + $info['id'] = Sanitizer::escapeIdForAttribute( "{$this->mID}--$key--{$info['id']}" ); } else { - $info['id'] = Sanitizer::escapeId( "{$this->mID}--$key--$fieldname" ); + $info['id'] = Sanitizer::escapeIdForAttribute( "{$this->mID}--$key--$fieldname" ); } // Copy the hide-if rules to "child" fields, so that the JavaScript code handling them // (resources/src/mediawiki/htmlform/hide-if.js) doesn't have to handle nested fields. 
@@ -313,7 +313,7 @@ class HTMLFormFieldCloner extends HTMLFormField { 'type' => 'submit', 'formnovalidate' => true, 'name' => $name, - 'id' => Sanitizer::escapeId( "{$this->mID}--$key--delete" ), + 'id' => Sanitizer::escapeIdForAttribute( "{$this->mID}--$key--delete" ), 'cssclass' => 'mw-htmlform-cloner-delete-button', 'default' => $this->getMessage( $label )->text(), ], $this->mParent ); @@ -386,7 +386,7 @@ class HTMLFormFieldCloner extends HTMLFormField { 'type' => 'submit', 'formnovalidate' => true, 'name' => $name, - 'id' => Sanitizer::escapeId( "{$this->mID}--create" ), + 'id' => Sanitizer::escapeIdForAttribute( "{$this->mID}--create" ), 'cssclass' => 'mw-htmlform-cloner-create-button', 'default' => $this->getMessage( $label )->text(), ], $this->mParent ); diff --git a/includes/htmlform/fields/HTMLRadioField.php b/includes/htmlform/fields/HTMLRadioField.php index 06ec3722ed..77ea7cd211 100644 --- a/includes/htmlform/fields/HTMLRadioField.php +++ b/includes/htmlform/fields/HTMLRadioField.php @@ -90,7 +90,7 @@ class HTMLRadioField extends HTMLFormField { $html .= Html::rawElement( 'h1', [], $label ) . "\n"; $html .= $this->formatOptions( $info, $value ); } else { - $id = Sanitizer::escapeId( $this->mID . "-$info" ); + $id = Sanitizer::escapeIdForAttribute( $this->mID . "-$info" ); $classes = [ 'mw-htmlform-flatlist-item' ]; if ( $wgUseMediaWikiUIEverywhere || $this->mParent instanceof VFormHTMLForm ) { $classes[] = 'mw-ui-radio'; diff --git a/includes/page/ImagePage.php b/includes/page/ImagePage.php index d37700b58d..b870831e5f 100644 --- a/includes/page/ImagePage.php +++ b/includes/page/ImagePage.php @@ -254,15 +254,16 @@ class ImagePage extends Article { $r .= "\n"; foreach ( $metadata as $type => $stuff ) { foreach ( $stuff as $v ) { - # @todo FIXME: Why is this using escapeId for a class?! 
- $class = Sanitizer::escapeId( $v['id'] ); + $class = str_replace( ' ', '_', $v['id'] ); if ( $type == 'collapsed' ) { // Handled by mediawiki.action.view.metadata module. $class .= ' collapsable'; } - $r .= "\n"; - $r .= "\n"; - $r .= "\n"; + $r .= Html::rawElement( 'tr', + [ 'class' => $class ], + Html::rawElement( 'th', [], $v['name'] ) + . Html::rawElement( 'td', [], $v['value'] ) + ); } } $r .= "
{$v['name']}{$v['value']}
\n\n"; diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php index b035b0243d..88439db98a 100644 --- a/includes/parser/Parser.php +++ b/includes/parser/Parser.php @@ -4035,7 +4035,7 @@ class Parser { * @private */ public function formatHeadings( $text, $origText, $isMain = true ) { - global $wgMaxTocLevel, $wgExperimentalHtmlIds; + global $wgMaxTocLevel; # Inhibit editsection links if requested in the page if ( isset( $this->mDoubleUnderscores['noeditsection'] ) ) { @@ -4229,61 +4229,44 @@ class Parser { # Save headline for section edit hint before it's escaped $headlineHint = $safeHeadline; - if ( $wgExperimentalHtmlIds ) { - # For reverse compatibility, provide an id that's - # HTML4-compatible, like we used to. - # It may be worth noting, academically, that it's possible for - # the legacy anchor to conflict with a non-legacy headline - # anchor on the page. In this case likely the "correct" thing - # would be to either drop the legacy anchors or make sure - # they're numbered first. However, this would require people - # to type in section names like "abc_.D7.93.D7.90.D7.A4" - # manually, so let's not bother worrying about it. - $legacyHeadline = Sanitizer::escapeId( $safeHeadline, - [ 'noninitial', 'legacy' ] ); - $safeHeadline = Sanitizer::escapeId( $safeHeadline ); - - if ( $legacyHeadline == $safeHeadline ) { - # No reason to have both (in fact, we can't) - $legacyHeadline = false; - } - } else { - $legacyHeadline = false; - $safeHeadline = Sanitizer::escapeId( $safeHeadline, - 'noninitial' ); + $fallbackHeadline = Sanitizer::escapeIdForAttribute( $safeHeadline, Sanitizer::ID_FALLBACK ); + $linkAnchor = Sanitizer::escapeIdForLink( $safeHeadline ); + $safeHeadline = Sanitizer::escapeIdForAttribute( $safeHeadline, Sanitizer::ID_PRIMARY ); + if ( $fallbackHeadline === $safeHeadline ) { + # No reason to have both (in fact, we can't) + $fallbackHeadline = false; } - # HTML names must be case-insensitively unique (T12721). 
- # This does not apply to Unicode characters per - # https://www.w3.org/TR/html5/infrastructure.html#case-sensitivity-and-string-comparison + # HTML IDs must be case-insensitively unique for IE compatibility (T12721). # @todo FIXME: We may be changing them depending on the current locale. $arrayKey = strtolower( $safeHeadline ); - if ( $legacyHeadline === false ) { - $legacyArrayKey = false; + if ( $fallbackHeadline === false ) { + $fallbackArrayKey = false; } else { - $legacyArrayKey = strtolower( $legacyHeadline ); + $fallbackArrayKey = strtolower( $fallbackHeadline ); } # Create the anchor for linking from the TOC to the section $anchor = $safeHeadline; - $legacyAnchor = $legacyHeadline; + $fallbackAnchor = $fallbackHeadline; if ( isset( $refers[$arrayKey] ) ) { // @codingStandardsIgnoreStart for ( $i = 2; isset( $refers["${arrayKey}_$i"] ); ++$i ); // @codingStandardsIgnoreEnd $anchor .= "_$i"; + $linkAnchor .= "_$i"; $refers["${arrayKey}_$i"] = true; } else { $refers[$arrayKey] = true; } - if ( $legacyHeadline !== false && isset( $refers[$legacyArrayKey] ) ) { + if ( $fallbackHeadline !== false && isset( $refers[$fallbackArrayKey] ) ) { // @codingStandardsIgnoreStart - for ( $i = 2; isset( $refers["${legacyArrayKey}_$i"] ); ++$i ); + for ( $i = 2; isset( $refers["${fallbackArrayKey}_$i"] ); ++$i ); // @codingStandardsIgnoreEnd - $legacyAnchor .= "_$i"; - $refers["${legacyArrayKey}_$i"] = true; + $fallbackAnchor .= "_$i"; + $refers["${fallbackArrayKey}_$i"] = true; } else { - $refers[$legacyArrayKey] = true; + $refers[$fallbackArrayKey] = true; } # Don't number the heading if it is the only one (looks silly) @@ -4297,7 +4280,7 @@ class Parser { } if ( $enoughToc && ( !isset( $wgMaxTocLevel ) || $toclevel < $wgMaxTocLevel ) ) { - $toc .= Linker::tocLine( $anchor, $tocline, + $toc .= Linker::tocLine( $linkAnchor, $tocline, $numbering, $toclevel, ( $isTemplate ? 
false : $sectionIndex ) ); } @@ -4364,7 +4347,7 @@ class Parser { } $head[$headlineCount] = Linker::makeHeadline( $level, $matches['attrib'][$headlineCount], $anchor, $headline, - $editlink, $legacyAnchor ); + $editlink, $fallbackAnchor ); $headlineCount++; } @@ -5806,22 +5789,33 @@ class Parser { # Strip out wikitext links(they break the anchor) $text = $this->stripSectionName( $text ); $text = Sanitizer::normalizeSectionNameWhitespace( $text ); - return '#' . Sanitizer::escapeId( $text, 'noninitial' ); + return '#' . Sanitizer::escapeIdForLink( $text ); } /** * Same as guessSectionNameFromWikiText(), but produces legacy anchors - * instead. For use in redirects, since IE6 interprets Redirect: headers - * as something other than UTF-8 (apparently?), resulting in breakage. + * instead, if possible. For use in redirects, since various versions + * of Microsoft browsers interpret Location: headers as something other + * than UTF-8, resulting in breakage. * * @param string $text The section name * @return string An anchor */ public function guessLegacySectionNameFromWikiText( $text ) { + global $wgFragmentMode; + # Strip out wikitext links(they break the anchor) $text = $this->stripSectionName( $text ); $text = Sanitizer::normalizeSectionNameWhitespace( $text ); - return '#' . Sanitizer::escapeId( $text, [ 'noninitial', 'legacy' ] ); + + if ( isset( $wgFragmentMode[1] ) && $wgFragmentMode[1] === 'legacy' ) { + // ForAttribute() and ForLink() are the same for legacy encoding + $id = Sanitizer::escapeIdForAttribute( $text, Sanitizer::ID_FALLBACK ); + } else { + $id = Sanitizer::escapeIdForLink( $text ); + } + + return "#$id"; } /** diff --git a/includes/resourceloader/ResourceLoaderMediaWikiUtilModule.php b/includes/resourceloader/ResourceLoaderMediaWikiUtilModule.php new file mode 100644 index 0000000000..1fe3434466 --- /dev/null +++ b/includes/resourceloader/ResourceLoaderMediaWikiUtilModule.php @@ -0,0 +1,46 @@ + $this->getConfig()->get( 'FragmentMode' ) ] + ) + . 
"\n" + . parent::getScript( $context ); + } + + /** + * @inheritdoc + */ + public function enableModuleContentVersion() { + return true; + } +} diff --git a/includes/skins/BaseTemplate.php b/includes/skins/BaseTemplate.php index 0b7fc2f5d3..aad676f8e3 100644 --- a/includes/skins/BaseTemplate.php +++ b/includes/skins/BaseTemplate.php @@ -678,7 +678,7 @@ abstract class BaseTemplate extends QuickTemplate { } foreach ( $validFooterIcons as $blockName => $footerIcons ) { $html .= Html::openElement( 'div', [ - 'id' => 'f-' . Sanitizer::escapeId( $blockName ) . 'ico', + 'id' => Sanitizer::escapeIdForAttribute( "f-{$blockName}ico" ), 'class' => 'footer-icons' ] ); foreach ( $footerIcons as $icon ) { @@ -691,7 +691,7 @@ abstract class BaseTemplate extends QuickTemplate { foreach ( $validFooterLinks as $aLink ) { $html .= Html::rawElement( 'li', - [ 'id' => Sanitizer::escapeId( $aLink ) ], + [ 'id' => Sanitizer::escapeIdForAttribute( $aLink ) ], $this->get( $aLink ) ); } @@ -734,7 +734,7 @@ abstract class BaseTemplate extends QuickTemplate { $out .= Html::rawElement( 'div', [ - 'id' => Sanitizer::escapeId( "mw-indicator-$id" ), + 'id' => Sanitizer::escapeIdForAttribute( "mw-indicator-$id" ), 'class' => 'mw-indicator', ], $content diff --git a/includes/skins/Skin.php b/includes/skins/Skin.php index 40905a56df..849362aae7 100644 --- a/includes/skins/Skin.php +++ b/includes/skins/Skin.php @@ -1375,8 +1375,8 @@ abstract class Skin extends ContextSource { $bar[$heading][] = array_merge( [ 'text' => $text, 'href' => $href, - 'id' => 'n-' . Sanitizer::escapeId( strtr( $line[1], ' ', '-' ), 'noninitial' ), - 'active' => false + 'id' => Sanitizer::escapeIdForAttribute( 'n-' . 
strtr( $line[1], ' ', '-' ) ), + 'active' => false, ], $extraAttribs ); } else { continue; diff --git a/includes/specials/SpecialListgrants.php b/includes/specials/SpecialListgrants.php index 2c92410cbc..1a04eec473 100644 --- a/includes/specials/SpecialListgrants.php +++ b/includes/specials/SpecialListgrants.php @@ -69,7 +69,7 @@ class SpecialListGrants extends SpecialPage { $grantCellHtml = ''; } - $id = \Sanitizer::escapeId( $grant ); + $id = Sanitizer::escapeIdForAttribute( $grant ); $out->addHTML( \Html::rawElement( 'tr', [ 'id' => $id ], "" . $this->msg( diff --git a/includes/specials/SpecialListgrouprights.php b/includes/specials/SpecialListgrouprights.php index 7a25e55d58..2315887aa5 100644 --- a/includes/specials/SpecialListgrouprights.php +++ b/includes/specials/SpecialListgrouprights.php @@ -126,7 +126,7 @@ class SpecialListGroupRights extends SpecialPage { ? $groupsRemoveFromSelf[$group] : []; - $id = $group == '*' ? false : Sanitizer::escapeId( $group ); + $id = $group == '*' ? false : Sanitizer::escapeIdForAttribute( $group ); $out->addHTML( Html::rawElement( 'tr', [ 'id' => $id ], " $grouppage$grouplink " . diff --git a/includes/specials/SpecialVersion.php b/includes/specials/SpecialVersion.php index 30c4a0be8f..3ea1d03aeb 100644 --- a/includes/specials/SpecialVersion.php +++ b/includes/specials/SpecialVersion.php @@ -840,7 +840,7 @@ class SpecialVersion extends SpecialPage { // Finally! Create the table $html = Html::openElement( 'tr', [ 'class' => 'mw-version-ext', - 'id' => Sanitizer::escapeId( 'mw-version-ext-' . $type . '-' . $extension['name'] ) + 'id' => Sanitizer::escapeIdForAttribute( 'mw-version-ext-' . $type . '-' . 
$extension['name'] ) ] ); diff --git a/includes/specials/pagers/AllMessagesTablePager.php b/includes/specials/pagers/AllMessagesTablePager.php index ca1b7dca9d..e6a0f0be51 100644 --- a/includes/specials/pagers/AllMessagesTablePager.php +++ b/includes/specials/pagers/AllMessagesTablePager.php @@ -375,7 +375,9 @@ class AllMessagesTablePager extends TablePager { } if ( !$isSecond ) { - $arr['id'] = Sanitizer::escapeId( 'msg_' . $this->getLanguage()->lcfirst( $row->am_title ) ); + $arr['id'] = Sanitizer::escapeIdForAttribute( + 'msg_' . $this->getLanguage()->lcfirst( $row->am_title ) + ); } return $arr; diff --git a/resources/Resources.php b/resources/Resources.php index 9860328f50..1663b50b87 100644 --- a/resources/Resources.php +++ b/resources/Resources.php @@ -1379,6 +1379,7 @@ return [ ] ], 'mediawiki.util' => [ + 'class' => 'ResourceLoaderMediaWikiUtilModule', 'scripts' => 'resources/src/mediawiki/mediawiki.util.js', 'dependencies' => [ 'jquery.accessKeyLabel', diff --git a/resources/src/mediawiki.action/mediawiki.action.edit.preview.js b/resources/src/mediawiki.action/mediawiki.action.edit.preview.js index 2b6fc9d8aa..706e1feaa9 100644 --- a/resources/src/mediawiki.action/mediawiki.action.edit.preview.js +++ b/resources/src/mediawiki.action/mediawiki.action.edit.preview.js @@ -150,7 +150,7 @@ newList.push( $( '
' ) .addClass( 'mw-indicator' ) - .attr( 'id', mw.util.escapeId( 'mw-indicator-' + name ) ) + .attr( 'id', mw.util.escapeIdForAttribute( 'mw-indicator-' + name ) ) .html( indicator ) .get( 0 ), // Add a whitespace between the
s because diff --git a/resources/src/mediawiki/mediawiki.util.js b/resources/src/mediawiki/mediawiki.util.js index 4844e21498..d0ec58501c 100644 --- a/resources/src/mediawiki/mediawiki.util.js +++ b/resources/src/mediawiki/mediawiki.util.js @@ -1,12 +1,59 @@ ( function ( mw, $ ) { 'use strict'; + var util; + + /** + * Encode the string like PHP's rawurlencode + * @ignore + * + * @param {string} str String to be encoded. + * @return {string} Encoded string + */ + function rawurlencode( str ) { + str = String( str ); + return encodeURIComponent( str ) + .replace( /!/g, '%21' ).replace( /'/g, '%27' ).replace( /\(/g, '%28' ) + .replace( /\)/g, '%29' ).replace( /\*/g, '%2A' ).replace( /~/g, '%7E' ); + } + + /** + * Private helper function used by util.escapeId*() + * @ignore + * + * @param {string} str String to be encoded + * @param {string} mode Encoding mode, see documentation for $wgFragmentMode + * in DefaultSettings.php + * @return {string} Encoded string + */ + function escapeIdInternal( str, mode ) { + str = String( str ); + + switch ( mode ) { + case 'html5': + return str.replace( / /g, '_' ); + case 'html5-legacy': + str = str.replace( /[ \t\n\r\f_'"&#%]+/g, '_' ) + .replace( /^_+|_+$/, '' ); + if ( str === '' ) { + str = '_'; + } + return str; + case 'legacy': + return rawurlencode( str.replace( / /g, '_' ) ) + .replace( /%3A/g, ':' ) + .replace( /%/g, '.' ); + default: + throw new Error( 'Unrecognized ID escaping mode ' + mode ); + } + } + /** * Utility library * @class mw.util * @singleton */ - var util = { + util = { /* Main body */ @@ -16,24 +63,52 @@ * @param {string} str String to be encoded. 
* @return {string} Encoded string */ - rawurlencode: function ( str ) { - str = String( str ); - return encodeURIComponent( str ) - .replace( /!/g, '%21' ).replace( /'/g, '%27' ).replace( /\(/g, '%28' ) - .replace( /\)/g, '%29' ).replace( /\*/g, '%2A' ).replace( /~/g, '%7E' ); - }, + rawurlencode: rawurlencode, /** - * Encode the string like Sanitizer::escapeId in PHP + * Encode the string like Sanitizer::escapeId() in PHP + * @deprecated since 1.30 use escapeIdForAttribute() or escapeIdForLink() * * @param {string} str String to be encoded. * @return {string} Encoded string */ escapeId: function ( str ) { - str = String( str ); - return util.rawurlencode( str.replace( / /g, '_' ) ) - .replace( /%3A/g, ':' ) - .replace( /%/g, '.' ); + return escapeIdInternal( str, 'legacy' ); + }, + + /** + * Encode string into HTML id compatible form suitable for use in HTML + * Analog to PHP Sanitizer::escapeIdForAttribute() + * + * @since 1.30 + * + * @param {string} str String to encode + * @return {string} Encoded string + */ + escapeIdForAttribute: function ( str ) { + var mode = mw.config.get( 'wgFragmentMode' )[ 0 ]; + + return escapeIdInternal( str, mode ); + }, + + /** + * Encode string into HTML id compatible form suitable for use in links + * Analog to PHP Sanitizer::escapeIdForLink() + * + * @since 1.30 + * + * @param {string} str String to encode + * @return {string} Encoded string + */ + escapeIdForLink: function ( str ) { + var mode = mw.config.get( 'wgFragmentMode' )[ 0 ], + id = escapeIdInternal( str, mode ); + + if ( mode === 'html5' ) { + id = encodeURIComponent( id ).replace( /%3A/g, ':' ); + } + + return id; }, /** @@ -101,7 +176,7 @@ // Append the encoded fragment if ( fragment.length ) { - url += '#' + util.escapeId( fragment ); + url += '#' + util.escapeIdForLink( fragment ); } return url; diff --git a/tests/parser/ParserTestRunner.php b/tests/parser/ParserTestRunner.php index feed77fafd..e578418542 100644 --- a/tests/parser/ParserTestRunner.php +++ 
b/tests/parser/ParserTestRunner.php @@ -1055,6 +1055,8 @@ class ParserTestRunner { // wgEnableMagicLinks={"ISBN":false, "PMID":false, "RFC":false} 'wgEnableMagicLinks' => self::getOptionValue( 'wgEnableMagicLinks', $opts, [] ) + [ 'ISBN' => true, 'PMID' => true, 'RFC' => true ], + // Test with legacy encoding by default until HTML5 is very stable and default + 'wgFragmentMode' => [ 'legacy' ], ]; if ( $config ) { diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt index f8ba742c99..ab79b59409 100644 --- a/tests/parser/parserTests.txt +++ b/tests/parser/parserTests.txt @@ -28386,3 +28386,144 @@ showindicators 1&2&3&4&amp;5=Indicator !! end + +!! test +HTML5 ids: fallback to legacy +!! config +wgFragmentMode=[ 'html5', 'legacy' ] +!! wikitext +== Foo bar == + +== foo Bar == + +== Тест == + +== Тест == + +== тест == + +== Hey < # " > % : ' == +[[#Foo bar]] [[#foo Bar]] [[#Тест]] [[#тест]] [[#Hey < # " > % : ']] + +{{anchorencode:💩}} + + +[[#啤酒]] [[#%E5%95%A4%E9%85%92]] + +!! html/php + + +

Foo bar[edit]

+

foo Bar[edit]

+

Тест[edit]

+

Тест[edit]

+

тест[edit]

+

Hey < # " > % : '[edit]

+

#Foo bar #foo Bar #Тест #тест #Hey < # " > % : ' +

%F0%9F%92%A9 +

#啤酒 #啤酒 +

+!! end + +!! test +HTML5 ids: legacy with a fallback to modern +!! config +wgFragmentMode=[ 'legacy', 'html5' ] +!! wikitext +== Foo bar == + +== foo Bar == + +== Тест == + +== Тест == + +== тест == + +== Hey < # " > % : ' == +[[#Foo bar]] [[#foo Bar]] [[#Тест]] [[#тест]] [[#Hey < # " > % : ']] + +{{anchorencode:💩}} + + +[[#啤酒]] [[#%E5%95%A4%E9%85%92]] + +!! html/php + + +

Foo bar[edit]

+

foo Bar[edit]

+

Тест[edit]

+

Тест[edit]

+

тест[edit]

+

Hey < # " > % : '[edit]

+

#Foo bar #foo Bar #Тест #тест #Hey < # " > % : ' +

.F0.9F.92.A9 +

#啤酒 #啤酒 +

+!! end + +!! test +HTML5 ids: no legacy +!! config +wgFragmentMode=[ 'html5' ] +!! wikitext +== Foo bar == + +== foo Bar == + +== Тест == + +== Тест == + +== тест == + +== Hey < # " > % : ' == +[[#Foo bar]] [[#foo Bar]] [[#Тест]] [[#тест]] [[#Hey < # " > % : ']] + +{{anchorencode:💩}} + + +[[#啤酒]] [[#%E5%95%A4%E9%85%92]] + +!! html/php + + +

Foo bar[edit]

+

foo Bar[edit]

+

Тест[edit]

+

Тест[edit]

+

тест[edit]

+

Hey < # " > % : '[edit]

+

#Foo bar #foo Bar #Тест #тест #Hey < # " > % : ' +

%F0%9F%92%A9 +

#啤酒 #啤酒 +

+!! end diff --git a/tests/phpunit/includes/SanitizerTest.php b/tests/phpunit/includes/SanitizerTest.php index 6d093b0f13..d5066235ec 100644 --- a/tests/phpunit/includes/SanitizerTest.php +++ b/tests/phpunit/includes/SanitizerTest.php @@ -3,6 +3,8 @@ /** * @todo Tests covering decodeCharReferences can be refactored into a single * method and dataprovider. + * + * @group Sanitizer */ class SanitizerTest extends MediaWikiTestCase { @@ -379,7 +381,7 @@ class SanitizerTest extends MediaWikiTestCase { } /** - * Test escapeIdReferenceList for consistency with escapeId + * Test escapeIdReferenceList for consistency with escapeIdForAttribute * * @dataProvider provideEscapeIdReferenceList * @covers Sanitizer::escapeIdReferenceList @@ -387,9 +389,9 @@ class SanitizerTest extends MediaWikiTestCase { public function testEscapeIdReferenceList( $referenceList, $id1, $id2 ) { $this->assertEquals( Sanitizer::escapeIdReferenceList( $referenceList, 'noninitial' ), - Sanitizer::escapeId( $id1, 'noninitial' ) + Sanitizer::escapeIdForAttribute( $id1 ) . ' ' - . Sanitizer::escapeId( $id2, 'noninitial' ) + . 
Sanitizer::escapeIdForAttribute( $id2 ) ); } @@ -422,4 +424,119 @@ class SanitizerTest extends MediaWikiTestCase { [ 'data-mwfoo', true ], // could be false but this is how it's implemented currently ]; } + + /** + * @dataProvider provideEscapeIdForStuff + * + * @covers Sanitizer::escapeIdForAttribute() + * @covers Sanitizer::escapeIdForLink() + * @covers Sanitizer::escapeIdForExternalInterwiki() + * @covers Sanitizer::escapeIdInternal() + * @covers Sanitizer::urlEscapeId() + * + * @param string $stuff + * @param string[] $config + * @param string $id + * @param string|false $expected + * @param int|null $mode + */ + public function testEscapeIdForStuff( $stuff, array $config, $id, $expected, $mode = null ) { + $func = "Sanitizer::escapeIdFor{$stuff}"; + $iwFlavor = array_pop( $config ); + $this->setMwGlobals( [ + 'wgFragmentMode' => $config, + 'wgExternalInterwikiFragmentMode' => $iwFlavor, + ] ); + $escaped = call_user_func( $func, $id, $mode ); + self::assertEquals( $expected, $escaped ); + } + + public function provideEscapeIdForStuff() { + // Test inputs and outputs + $text = 'foo тест_#%!\'()[]:<>'; + $legacyEncoded = 'foo_.D1.82.D0.B5.D1.81.D1.82_.23.25.21.27.28.29.5B.5D:.3C.3E'; + $html5Encoded = 'foo_тест_#%!\'()[]:<>'; + $html5Escaped = 'foo_%D1%82%D0%B5%D1%81%D1%82_%23%25%21%27%28%29%5B%5D:%3C%3E'; + $html5Experimental = 'foo_тест_!_()[]:<>'; + + // Settings: last element is $wgExternalInterwikiFragmentMode, the rest is $wgFragmentMode + $legacy = [ 'legacy', 'legacy' ]; + $legacyNew = [ 'legacy', 'html5', 'legacy' ]; + $newLegacy = [ 'html5', 'legacy', 'legacy' ]; + $new = [ 'html5', 'legacy' ]; + $allNew = [ 'html5', 'html5' ]; + $experimentalLegacy = [ 'html5-legacy', 'legacy', 'legacy' ]; + $newExperimental = [ 'html5', 'html5-legacy', 'legacy' ]; + + return [ + // Pure legacy: how MW worked before 2017 + [ 'Attribute', $legacy, $text, $legacyEncoded, Sanitizer::ID_PRIMARY ], + [ 'Attribute', $legacy, $text, false, Sanitizer::ID_FALLBACK ], + [ 
'Link', $legacy, $text, $legacyEncoded ], + [ 'ExternalInterwiki', $legacy, $text, $legacyEncoded ], + + // Transition to a new world: legacy links with HTML5 fallback + [ 'Attribute', $legacyNew, $text, $legacyEncoded, Sanitizer::ID_PRIMARY ], + [ 'Attribute', $legacyNew, $text, $html5Encoded, Sanitizer::ID_FALLBACK ], + [ 'Link', $legacyNew, $text, $legacyEncoded ], + [ 'ExternalInterwiki', $legacyNew, $text, $legacyEncoded ], + + // New world: HTML5 links, legacy fallbacks + [ 'Attribute', $newLegacy, $text, $html5Encoded, Sanitizer::ID_PRIMARY ], + [ 'Attribute', $newLegacy, $text, $legacyEncoded, Sanitizer::ID_FALLBACK ], + [ 'Link', $newLegacy, $text, $html5Escaped ], + [ 'ExternalInterwiki', $newLegacy, $text, $legacyEncoded ], + + // Distant future: no legacy fallbacks, but still linking to legacy wikis + [ 'Attribute', $new, $text, $html5Encoded, Sanitizer::ID_PRIMARY ], + [ 'Attribute', $new, $text, false, Sanitizer::ID_FALLBACK ], + [ 'Link', $new, $text, $html5Escaped ], + [ 'ExternalInterwiki', $new, $text, $legacyEncoded ], + + // Just before the heat death of universe: external interwikis are also HTML5 \m/ + [ 'Attribute', $allNew, $text, $html5Encoded, Sanitizer::ID_PRIMARY ], + [ 'Attribute', $allNew, $text, false, Sanitizer::ID_FALLBACK ], + [ 'Link', $allNew, $text, $html5Escaped ], + [ 'ExternalInterwiki', $allNew, $text, $html5Escaped ], + + // Someone flipped $wgExperimentalHtmlIds on + [ 'Attribute', $experimentalLegacy, $text, $html5Experimental, Sanitizer::ID_PRIMARY ], + [ 'Attribute', $experimentalLegacy, $text, $legacyEncoded, Sanitizer::ID_FALLBACK ], + [ 'Link', $experimentalLegacy, $text, $html5Experimental ], + [ 'ExternalInterwiki', $experimentalLegacy, $text, $legacyEncoded ], + + // Migration from $wgExperimentalHtmlIds to modern HTML5 + [ 'Attribute', $newExperimental, $text, $html5Encoded, Sanitizer::ID_PRIMARY ], + [ 'Attribute', $newExperimental, $text, $html5Experimental, Sanitizer::ID_FALLBACK ], + [ 'Link',
$newExperimental, $text, $html5Escaped ], + [ 'ExternalInterwiki', $newExperimental, $text, $legacyEncoded ], + ]; + } + + /** + * @expectedException InvalidArgumentException + * @covers Sanitizer::escapeIdInternal() + */ + public function testInvalidFragmentThrows() { + $this->setMwGlobals( 'wgFragmentMode', [ 'boom!' ] ); + Sanitizer::escapeIdForAttribute( 'This should throw' ); + } + + /** + * @expectedException UnexpectedValueException + * @covers Sanitizer::escapeIdForAttribute() + */ + public function testNoPrimaryFragmentModeThrows() { + $this->setMwGlobals( 'wgFragmentMode', [ 666 => 'html5' ] ); + Sanitizer::escapeIdForAttribute( 'This should throw' ); + } + + /** + * @expectedException UnexpectedValueException + * @covers Sanitizer::escapeIdForLink() + */ + public function testNoPrimaryFragmentModeThrows2() { + $this->setMwGlobals( 'wgFragmentMode', [ 666 => 'html5' ] ); + Sanitizer::escapeIdForLink( 'This should throw' ); + } } diff --git a/tests/qunit/suites/resources/mediawiki/mediawiki.util.test.js b/tests/qunit/suites/resources/mediawiki/mediawiki.util.test.js index da04c8d541..2efe9cd7f4 100644 --- a/tests/qunit/suites/resources/mediawiki/mediawiki.util.test.js +++ b/tests/qunit/suites/resources/mediawiki/mediawiki.util.test.js @@ -93,7 +93,7 @@ } ); QUnit.test( 'escapeId', function ( assert ) { - mw.config.set( 'wgExperimentalHtmlIds', false ); + mw.config.set( 'wgFragmentMode', [ 'legacy' ] ); $.each( { '+': '.2B', '&': '.26', @@ -117,6 +117,75 @@ } ); } ); + QUnit.test( 'escapeIdForAttribute', function ( assert ) { + // Test cases are kept in sync with SanitizerTest.php + var text = 'foo тест_#%!\'()[]:<>', + legacyEncoded = 'foo_.D1.82.D0.B5.D1.81.D1.82_.23.25.21.27.28.29.5B.5D:.3C.3E', + html5Encoded = 'foo_тест_#%!\'()[]:<>', + html5Experimental = 'foo_тест_!_()[]:<>', + // Settings: this is $wgFragmentMode + legacy = [ 'legacy' ], + legacyNew = [ 'legacy', 'html5' ], + newLegacy = [ 'html5', 'legacy' ], + allNew = [ 'html5' ], + 
experimentalLegacy = [ 'html5-legacy', 'legacy' ], + newExperimental = [ 'html5', 'html5-legacy' ]; + + // Test cases are kept in sync with SanitizerTest.php + $.each( [ + // Pure legacy: how MW worked before 2017 + [ legacy, text, legacyEncoded ], + // Transition to a new world: legacy links with HTML5 fallback + [ legacyNew, text, legacyEncoded ], + // New world: HTML5 links, legacy fallbacks + [ newLegacy, text, html5Encoded ], + // Distant future: no legacy fallbacks + [ allNew, text, html5Encoded ], + // Someone flipped $wgExperimentalHtmlIds on + [ experimentalLegacy, text, html5Experimental ], + // Migration from $wgExperimentalHtmlIds to modern HTML5 + [ newExperimental, text, html5Encoded ] + ], function ( index, testCase ) { + mw.config.set( 'wgFragmentMode', testCase[ 0 ] ); + + assert.equal( util.escapeIdForAttribute( testCase[ 1 ] ), testCase[ 2 ] ); + } ); + } ); + + QUnit.test( 'escapeIdForLink', function ( assert ) { + // Test cases are kept in sync with SanitizerTest.php + var text = 'foo тест_#%!\'()[]:<>', + legacyEncoded = 'foo_.D1.82.D0.B5.D1.81.D1.82_.23.25.21.27.28.29.5B.5D:.3C.3E', + html5Escaped = 'foo_%D1%82%D0%B5%D1%81%D1%82_%23%25!\'()%5B%5D:%3C%3E', + html5Experimental = 'foo_тест_!_()[]:<>', + // Settings: this is wgFragmentMode + legacy = [ 'legacy' ], + legacyNew = [ 'legacy', 'html5' ], + newLegacy = [ 'html5', 'legacy' ], + allNew = [ 'html5' ], + experimentalLegacy = [ 'html5-legacy', 'legacy' ], + newExperimental = [ 'html5', 'html5-legacy' ]; + + $.each( [ + // Pure legacy: how MW worked before 2017 + [ legacy, text, legacyEncoded ], + // Transition to a new world: legacy links with HTML5 fallback + [ legacyNew, text, legacyEncoded ], + // New world: HTML5 links, legacy fallbacks + [ newLegacy, text, html5Escaped ], + // Distant future: no legacy fallbacks + [ allNew, text, html5Escaped ], + // Someone flipped wgExperimentalHtmlIds on + [ experimentalLegacy, text, html5Experimental ], + // Migration from wgExperimentalHtmlIds to 
modern HTML5 + [ newExperimental, text, html5Escaped ] + ], function ( index, testCase ) { + mw.config.set( 'wgFragmentMode', testCase[ 0 ] ); + + assert.equal( util.escapeIdForLink( testCase[ 1 ] ), testCase[ 2 ] ); + } ); + } ); + QUnit.test( 'wikiUrlencode', function ( assert ) { assert.equal( util.wikiUrlencode( 'Test:A & B/Here' ), 'Test:A_%26_B/Here' ); // See also wfUrlencodeTest.php#provideURLS -- 2.20.1