From 8f45c9e03a568fcd7bf0478c438d6c60a943f70c Mon Sep 17 00:00:00 2001 From: Antoine Musso Date: Sun, 6 Feb 2011 14:47:35 +0000 Subject: [PATCH] bugfix for wfBCP47 and code coverage Language code are case insensitive. The BCP 47 recommands nice formatting nonetheless. This patch enhance our formatting: - tags preceded by the private tag 'x' are now lower case - 4 letters tags are now lower case with first letter uper cased Please note the RFC seems to have a bug for az-Arab-x-AZE-derbend which should be az-Arab-x-aze-derbend . I have changed our test to reflect this and added a comment for later reference. --- includes/GlobalFunctions.php | 8 +- tests/phpunit/includes/GlobalTest.php | 127 ++++++++++++++++++++++++++ 2 files changed, 133 insertions(+), 2 deletions(-) diff --git a/includes/GlobalFunctions.php b/includes/GlobalFunctions.php index 9af9b92000..dcc3f2329d 100644 --- a/includes/GlobalFunctions.php +++ b/includes/GlobalFunctions.php @@ -3397,6 +3397,7 @@ function wfShorthandToInteger( $string = '' ) { /** * Get the normalised IETF language tag + * See unit test for examples. * @param $code String: The language code. * @return $langCode String: The language code which complying with BCP 47 standards. */ @@ -3404,12 +3405,15 @@ function wfBCP47( $code ) { $codeSegment = explode( '-', $code ); foreach ( $codeSegment as $segNo => $seg ) { if ( count( $codeSegment ) > 0 ) { + // when previous segment is x, it is a private segment and should be lc + if( $segNo > 0 && strtolower( $codeSegment[($segNo - 1)] ) == 'x') { + $codeBCP[$segNo] = strtolower( $seg ); // ISO 3166 country code - if ( ( strlen( $seg ) == 2 ) && ( $segNo > 0 ) ) { + } elseif ( ( strlen( $seg ) == 2 ) && ( $segNo > 0 ) ) { $codeBCP[$segNo] = strtoupper( $seg ); // ISO 15924 script code } elseif ( ( strlen( $seg ) == 4 ) && ( $segNo > 0 ) ) { - $codeBCP[$segNo] = ucfirst( $seg ); + $codeBCP[$segNo] = ucfirst( strtolower( $seg ) ); // Use lowercase for other cases } else { $codeBCP[$segNo] = strtolower( $seg ); diff --git a/tests/phpunit/includes/GlobalTest.php b/tests/phpunit/includes/GlobalTest.php index 1a4918e760..a7ba796222 100644 --- a/tests/phpunit/includes/GlobalTest.php +++ b/tests/phpunit/includes/GlobalTest.php @@ -632,6 +632,133 @@ class GlobalTest extends MediaWikiTestCase { } + /** + * test @see wfBCP47(). + * Please note the BCP explicitly state that language codes are case + * insensitive, there are some exceptions to the rule :) + * This test is used to verify our formatting against all lower and + * all upper cases language code. + * + * @see http://tools.ietf.org/html/bcp47 + * @dataProvider provideLanguageCodes() + */ + function testBCP47( $code, $expected ) { + $code = strtolower( $code ); + $this->assertEquals( $expected, wfBCP47($code), + "Applying BCP47 standard to lower case '$code'" + ); + + $code = strtoupper( $code ); + $this->assertEquals( $expected, wfBCP47($code), + "Applying BCP47 standard to upper case '$code'" + ); + } + + /** + * Array format is ($code, $expected) + */ + function provideLanguageCodes() { + return array( + // Extracted from BCP47 (list not exhaustive) + # 2.1.1 + array( 'en-ca-x-ca' , 'en-CA-x-ca' ), + array( 'sgn-be-fr' , 'sgn-BE-FR' ), + array( 'az-latn-x-latn', 'az-Latn-x-latn' ), + # 2.2 + array( 'sr-Latn-RS', 'sr-Latn-RS' ), + array( 'az-arab-ir', 'az-Arab-IR' ), + + # 2.2.5 + array( 'sl-nedis' , 'sl-nedis' ), + array( 'de-ch-1996', 'de-CH-1996' ), + + # 2.2.6 + array( + 'en-latn-gb-boont-r-extended-sequence-x-private', + 'en-Latn-GB-boont-r-extended-sequence-x-private' + ), + + // Examples from BCP47 Appendix A + # Simple language subtag: + array( 'DE', 'de' ), + array( 'fR', 'fr' ), + array( 'ja', 'ja' ), + + # Language subtag plus script subtag: + array( 'zh-hans', 'zh-Hans'), + array( 'sr-cyrl', 'sr-Cyrl'), + array( 'sr-latn', 'sr-Latn'), + + # Extended language subtags and their primary language subtag + # counterparts: + array( 'zh-cmn-hans-cn', 'zh-cmn-Hans-CN' ), + array( 'cmn-hans-cn' , 'cmn-Hans-CN' ), + array( 'zh-yue-hk' , 'zh-yue-HK' ), + array( 'yue-hk' , 'yue-HK' ), + + # Language-Script-Region: + array( 'zh-hans-cn', 'zh-Hans-CN' ), + array( 'sr-latn-RS', 'sr-Latn-RS' ), + + # Language-Variant: + array( 'sl-rozaj' , 'sl-rozaj' ), + array( 'sl-rozaj-biske', 'sl-rozaj-biske' ), + array( 'sl-nedis' , 'sl-nedis' ), + + # Language-Region-Variant: + array( 'de-ch-1901' , 'de-CH-1901' ), + array( 'sl-it-nedis' , 'sl-IT-nedis' ), + + # Language-Script-Region-Variant: + array( 'hy-latn-it-arevela', 'hy-Latn-IT-arevela' ), + + # Language-Region: + array( 'de-de' , 'de-DE' ), + array( 'en-us' , 'en-US' ), + array( 'es-419', 'es-419'), + + # Private use subtags: + array( 'de-ch-x-phonebk' , 'de-CH-x-phonebk' ), + array( 'az-arab-x-aze-derbend', 'az-Arab-x-aze-derbend' ), + /** + * Previous test does not reflect the BCP which states: + * az-Arab-x-AZE-derbend + * AZE being private, it should be lower case, hence the test above + * should probably be: + #array( 'az-arab-x-aze-derbend', 'az-Arab-x-AZE-derbend' ), + */ + + # Private use registry values: + array( 'x-whatever', 'x-whatever' ), + array( 'qaa-qaaa-qm-x-southern', 'qaa-Qaaa-QM-x-southern' ), + array( 'de-qaaa' , 'de-Qaaa' ), + array( 'sr-latn-qm', 'sr-Latn-QM' ), + array( 'sr-qaaa-rs', 'sr-Qaaa-RS' ), + + # Tags that use extensions + array( 'en-us-u-islamcal', 'en-US-u-islamcal' ), + array( 'zh-cn-a-myext-x-private', 'zh-CN-a-myext-x-private' ), + array( 'en-a-myext-b-another', 'en-a-myext-b-another' ), + + # Invalid: + // de-419-DE + // a-DE + // ar-a-aaa-b-bbb-a-ccc + + /* + // ISO 15924 : + array( 'sr-Cyrl', 'sr-Cyrl' ), + array( 'SR-lATN', 'sr-Latn' ), # FIXME fix our function? + array( 'fr-latn', 'fr-Latn' ), + // Use lowercase for single segment + // ISO 3166-1-alpha-2 code + array( 'US', 'us' ), # USA + array( 'uS', 'us' ), # USA + array( 'Fr', 'fr' ), # France + array( 'va', 'va' ), # Holy See (Vatican City State) + */); + } + /* TODO: many more! */ } -- 2.20.1