From: Brad Jorsch Date: Wed, 24 Aug 2016 18:07:43 +0000 (-0400) Subject: API: Warn when input parameters are normalized X-Git-Tag: 1.31.0-rc.0~5836^2 X-Git-Url: http://git.cyclocoop.org/%22%20.%20generer_url_ecrire%28%22statistiques_visites%22%2C%22%22%29%20.%20%22?a=commitdiff_plain;h=087e25021be29e882dfc6e2e0197a531acd2f9d8;p=lhc%2Fweb%2Fwiklou.git API: Warn when input parameters are normalized If a client submits data that is not NFC-normalized Unicode or that contains C0 controls other than HT, LF, and CR, it gets normalized before the API ever sees it. Which can lead to difficult-to-handle bugs when, for example, a title is subject to normalization so the client can't find the specific title it submitted anywhere in the response (T139130). This patch does two things: * Detects when normalization was applied to an input value (at the MediaWiki level, anyway; if PHP or earlier does it we're just out of luck) and add a warning to that effect. * For ApiPageSet's 'titles' parameter, split into the individual titles and add them to the 'normalized' list in the response. This requires encoding the pre-normalized strings to avoid ApiResult's own normalization. Bug: T29849 Bug: T144071 Change-Id: I215fd3edd7a5e1b45292e60768bf6dd5ad7f34de --- diff --git a/RELEASE-NOTES-1.28 b/RELEASE-NOTES-1.28 index 58d9a0791e..416b2d5b84 100644 --- a/RELEASE-NOTES-1.28 +++ b/RELEASE-NOTES-1.28 @@ -90,6 +90,12 @@ production. * (T141960) Multi-valued parameters may now be separated using U+001F (Unit Separator) instead of the pipe character. This will be useful if some of the multiple values need to contain pipes, e.g. for action=options. +* The API will now warn if input is not NFC-normalized Unicode or if it + contains invalid characters. +* The 'normalized' list output by action=query and other modules that use + ApiPageSet may contain entries where the 'from' value is percent-encoded as + the raw value cannot be represented in a valid API response. These are + indicated by a 'fromencoded' boolean alongside the existing 'from' parameter. === Action API internal changes in 1.28 === * Added a new hook, 'ApiMakeParserOptions', to allow extensions to better diff --git a/includes/api/ApiBase.php b/includes/api/ApiBase.php index ca890913f1..55d243086c 100644 --- a/includes/api/ApiBase.php +++ b/includes/api/ApiBase.php @@ -1020,6 +1020,11 @@ abstract class ApiBase extends ContextSource { ); } } + + // Check for NFC normalization, and warn + if ( $rawValue !== $value ) { + $this->handleParamNormalization( $paramName, $value, $rawValue ); + } } if ( isset( $value ) && ( $multi || is_array( $type ) ) ) { @@ -1165,6 +1170,22 @@ abstract class ApiBase extends ContextSource { return $value; } + /** + * Handle when a parameter was Unicode-normalized + * @since 1.28 + * @param string $paramName Unprefixed parameter name + * @param string $value Input that will be used. + * @param string $rawValue Input before normalization. + */ + protected function handleParamNormalization( $paramName, $value, $rawValue ) { + $encParamName = $this->encodeParamName( $paramName ); + $this->setWarning( + "The value passed for '$encParamName' contains invalid or non-normalized data. " + . 'Textual data should be valid, NFC-normalized Unicode without ' + . 'C0 control characters other than HT (\\t), LF (\\n), and CR (\\r).' + ); + } + /** * Split a multi-valued parameter string, like explode() * @since 1.28 diff --git a/includes/api/ApiPageSet.php b/includes/api/ApiPageSet.php index 90438d4366..ed229cb9f4 100644 --- a/includes/api/ApiPageSet.php +++ b/includes/api/ApiPageSet.php @@ -495,10 +495,14 @@ class ApiPageSet extends ApiBase { * @since 1.21 */ public function getNormalizedTitlesAsResult( $result = null ) { + global $wgContLang; + $values = []; foreach ( $this->getNormalizedTitles() as $rawTitleStr => $titleStr ) { + $encode = ( $wgContLang->normalize( $rawTitleStr ) !== $rawTitleStr ); $values[] = [ - 'from' => $rawTitleStr, + 'fromencoded' => $encode, + 'from' => $encode ? rawurlencode( $rawTitleStr ) : $rawTitleStr, 'to' => $titleStr ]; } @@ -1403,6 +1407,23 @@ class ApiPageSet extends ApiBase { return $result; } + protected function handleParamNormalization( $paramName, $value, $rawValue ) { + parent::handleParamNormalization( $paramName, $value, $rawValue ); + + if ( $paramName === 'titles' ) { + // For the 'titles' parameter, we want to split it like ApiBase would + // and add any changed titles to $this->mNormalizedTitles + $value = $this->explodeMultiValue( $value, self::LIMIT_SML2 + 1 ); + $l = count( $value ); + $rawValue = $this->explodeMultiValue( $rawValue, $l ); + for ( $i = 0; $i < $l; $i++ ) { + if ( $value[$i] !== $rawValue[$i] ) { + $this->mNormalizedTitles[$rawValue[$i]] = $value[$i]; + } + } + } + } + private static $generators = null; /** diff --git a/includes/api/i18n/en.json b/includes/api/i18n/en.json index 68147c22e3..a68a87f764 100644 --- a/includes/api/i18n/en.json +++ b/includes/api/i18n/en.json @@ -1481,7 +1481,7 @@ "api-help-param-deprecated": "Deprecated.", "api-help-param-required": "This parameter is required.", "api-help-datatypes-header": "Data types", - "api-help-datatypes": "Some parameter types in API requests need further explanation:\n;boolean\n:Boolean parameters work like HTML checkboxes: if the parameter is specified, regardless of value, it is considered true. For a false value, omit the parameter entirely.\n;timestamp\n:Timestamps may be specified in several formats. ISO 8601 date and time is recommended. All times are in UTC, any included timezone is ignored.\n:* ISO 8601 date and time, 2001-01-15T14:56:00Z (punctuation and Z are optional)\n:* ISO 8601 date and time with (ignored) fractional seconds, 2001-01-15T14:56:00.00001Z (dashes, colons, and Z are optional)\n:* MediaWiki format, 20010115145600\n:* Generic numeric format, 2001-01-15 14:56:00 (optional timezone of GMT, +##, or -## is ignored)\n:* EXIF format, 2001:01:15 14:56:00\n:*RFC 2822 format (timezone may be omitted), Mon, 15 Jan 2001 14:56:00\n:* RFC 850 format (timezone may be omitted), Monday, 15-Jan-2001 14:56:00\n:* C ctime format, Mon Jan 15 14:56:00 2001\n:* Seconds since 1970-01-01T00:00:00Z as a 1 to 13 digit integer (excluding 0)\n:* The string now\n;alternative multiple-value separator\n:Parameters that take multiple values are normally submitted with the values separated using the pipe character, e.g. param=value1|value2 or param=value1%7Cvalue2. If a value must contain the pipe character, use U+001F (Unit Separator) as the separator ''and'' prefix the value with U+001F, e.g. param=%1Fvalue1%1Fvalue2.", + "api-help-datatypes": "Input to MediaWiki should be NFC-normalized UTF-8. MediaWiki may attempt to convert other input, but this may cause some operations (such as [[Special:ApiHelp/edit|edits]] with MD5 checks) to fail.\n\nSome parameter types in API requests need further explanation:\n;boolean\n:Boolean parameters work like HTML checkboxes: if the parameter is specified, regardless of value, it is considered true. For a false value, omit the parameter entirely.\n;timestamp\n:Timestamps may be specified in several formats. ISO 8601 date and time is recommended. All times are in UTC, any included timezone is ignored.\n:* ISO 8601 date and time, 2001-01-15T14:56:00Z (punctuation and Z are optional)\n:* ISO 8601 date and time with (ignored) fractional seconds, 2001-01-15T14:56:00.00001Z (dashes, colons, and Z are optional)\n:* MediaWiki format, 20010115145600\n:* Generic numeric format, 2001-01-15 14:56:00 (optional timezone of GMT, +##, or -## is ignored)\n:* EXIF format, 2001:01:15 14:56:00\n:*RFC 2822 format (timezone may be omitted), Mon, 15 Jan 2001 14:56:00\n:* RFC 850 format (timezone may be omitted), Monday, 15-Jan-2001 14:56:00\n:* C ctime format, Mon Jan 15 14:56:00 2001\n:* Seconds since 1970-01-01T00:00:00Z as a 1 to 13 digit integer (excluding 0)\n:* The string now\n;alternative multiple-value separator\n:Parameters that take multiple values are normally submitted with the values separated using the pipe character, e.g. param=value1|value2 or param=value1%7Cvalue2. If a value must contain the pipe character, use U+001F (Unit Separator) as the separator ''and'' prefix the value with U+001F, e.g. param=%1Fvalue1%1Fvalue2.", "api-help-param-type-limit": "Type: integer or max", "api-help-param-type-integer": "Type: {{PLURAL:$1|1=integer|2=list of integers}}", "api-help-param-type-boolean": "Type: boolean ([[Special:ApiHelp/main#main/datatypes|details]])", diff --git a/tests/phpunit/includes/api/ApiBaseTest.php b/tests/phpunit/includes/api/ApiBaseTest.php index ec9d6990b9..8b75d56281 100644 --- a/tests/phpunit/includes/api/ApiBaseTest.php +++ b/tests/phpunit/includes/api/ApiBaseTest.php @@ -72,6 +72,12 @@ class ApiBaseTest extends ApiTestCase { } public static function provideGetParameterFromSettings() { + $warnings = [ + 'The value passed for \'foo\' contains invalid or non-normalized data. Textual data should ' . + 'be valid, NFC-normalized Unicode without C0 control characters other than ' . + 'HT (\\t), LF (\\n), and CR (\\r).' + ]; + $c0 = ''; $enc = ''; for ( $i = 0; $i < 32; $i++ ) { @@ -83,6 +89,7 @@ class ApiBaseTest extends ApiTestCase { return [ 'Basic param' => [ 'bar', null, 'bar', [] ], + 'Basic param, C0 controls' => [ $c0, null, $enc, $warnings ], 'String param' => [ 'bar', '', 'bar', [] ], 'String param, defaulted' => [ null, '', '', [] ], 'String param, empty' => [ '', 'default', '', [] ], @@ -108,13 +115,13 @@ class ApiBaseTest extends ApiTestCase { $c0, [ ApiBase::PARAM_ISMULTI => true ], [ $enc ], - [] + $warnings ], 'Multi-valued parameter, other C0 controls (2)' => [ "\x1f" . $c0, [ ApiBase::PARAM_ISMULTI => true ], [ substr( $enc, 0, -3 ), '' ], - [] + $warnings ], ]; } diff --git a/tests/phpunit/includes/api/ApiPageSetTest.php b/tests/phpunit/includes/api/ApiPageSetTest.php index 367210a2e0..ad1deee5ce 100644 --- a/tests/phpunit/includes/api/ApiPageSetTest.php +++ b/tests/phpunit/includes/api/ApiPageSetTest.php @@ -75,4 +75,25 @@ class ApiPageSetTest extends ApiTestCase { return [ $target, $pageSet ]; } + + public function testHandleNormalization() { + $context = new RequestContext(); + $context->setRequest( new FauxRequest( [ 'titles' => "a|B|a\xcc\x8a" ] ) ); + $main = new ApiMain( $context ); + $pageSet = new ApiPageSet( $main ); + $pageSet->execute(); + + $this->assertSame( + [ 0 => [ 'A' => -1, 'B' => -2, 'Å' => -3 ] ], + $pageSet->getAllTitlesByNamespace() + ); + $this->assertSame( + [ + [ 'fromencoded' => true, 'from' => 'a%CC%8A', 'to' => 'å' ], + [ 'fromencoded' => false, 'from' => 'a', 'to' => 'A' ], + [ 'fromencoded' => false, 'from' => 'å', 'to' => 'Å' ], + ], + $pageSet->getNormalizedTitlesAsResult() + ); + } } diff --git a/tests/phpunit/includes/api/query/ApiQueryTest.php b/tests/phpunit/includes/api/query/ApiQueryTest.php index 504b16afd4..8cb2327dfb 100644 --- a/tests/phpunit/includes/api/query/ApiQueryTest.php +++ b/tests/phpunit/includes/api/query/ApiQueryTest.php @@ -43,6 +43,7 @@ class ApiQueryTest extends ApiTestCase { $this->assertEquals( [ + 'fromencoded' => false, 'from' => 'Project:articleA', 'to' => $to->getPrefixedText(), ], @@ -51,6 +52,7 @@ class ApiQueryTest extends ApiTestCase { $this->assertEquals( [ + 'fromencoded' => false, 'from' => 'article_B', 'to' => 'Article B' ],