From 990c1f0ad277d3e8ce72b6885b64aa2cdf100fe4 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Niklas=20Laxstr=C3=B6m?= Date: Thu, 7 Sep 2017 14:11:01 +0200 Subject: [PATCH] Port BCP47 formatter from PHP to JavaScript It can be accessed via mw.language.bcp47. To be used in ContentTranslation, see T157212 Change-Id: I37d32ab8a55c7101b903b03869899f00a39afd11 --- includes/GlobalFunctions.php | 1 + .../mediawiki.language/mediawiki.language.js | 39 ++++++++ .../mediawiki/mediawiki.language.test.js | 98 ++++++++++++++++++- 3 files changed, 137 insertions(+), 1 deletion(-) diff --git a/includes/GlobalFunctions.php b/includes/GlobalFunctions.php index 49159ed3df..799e3fc22e 100644 --- a/includes/GlobalFunctions.php +++ b/includes/GlobalFunctions.php @@ -3403,6 +3403,7 @@ function wfShorthandToInteger( $string = '', $default = -1 ) { /** * Get the normalised IETF language tag * See unit test for examples. + * See mediawiki.language.bcp47 for the JavaScript implementation. * * @param string $code The language code. * @return string The language code which complying with BCP 47 standards. diff --git a/resources/src/mediawiki.language/mediawiki.language.js b/resources/src/mediawiki.language/mediawiki.language.js index 3726a68527..6a5243403c 100644 --- a/resources/src/mediawiki.language/mediawiki.language.js +++ b/resources/src/mediawiki.language/mediawiki.language.js @@ -188,6 +188,45 @@ setSpecialCharacters: function ( data ) { this.specialCharacters = data; + }, + + /** + * Formats language tags according to the BCP 47 standard. + * See wfBCP47 for the PHP implementation. 
+ * + * @param {string} languageTag Well-formed language tag + * @return {string} + */ + bcp47: function ( languageTag ) { + var formatted, + isFirstSegment = true, + isPrivate = false, + segments = languageTag.split( '-' ); + + formatted = segments.map( function ( segment ) { + var newSegment; + + // The segment directly after an "x" segment is private use and is lowercased + if ( isPrivate ) { + newSegment = segment.toLowerCase(); + // ISO 3166 country code + } else if ( segment.length === 2 && !isFirstSegment ) { + newSegment = segment.toUpperCase(); + // ISO 15924 script code + } else if ( segment.length === 4 && !isFirstSegment ) { + newSegment = segment.charAt( 0 ).toUpperCase() + segment.substring( 1 ).toLowerCase(); + // Use lowercase for other cases + } else { + newSegment = segment.toLowerCase(); + } + + isPrivate = segment.toLowerCase() === 'x'; + isFirstSegment = false; + + return newSegment; + } ); + + return formatted.join( '-' ); } } ); diff --git a/tests/qunit/suites/resources/mediawiki/mediawiki.language.test.js b/tests/qunit/suites/resources/mediawiki/mediawiki.language.test.js index b9650796ef..5ce61ea752 100644 --- a/tests/qunit/suites/resources/mediawiki/mediawiki.language.test.js +++ b/tests/qunit/suites/resources/mediawiki/mediawiki.language.test.js @@ -1,7 +1,7 @@ ( function ( mw, $ ) { 'use strict'; - var grammarTests; + var grammarTests, bcp47Tests; QUnit.module( 'mediawiki.language', QUnit.newMwEnvironment( { setup: function () { @@ -587,4 +587,100 @@ assert.equal( mw.language.listToText( [ 'a', 'b' ] ), 'a and b', 'Two items' ); assert.equal( mw.language.listToText( [ 'a', 'b', 'c' ] ), 'a, b and c', 'More than two items' ); } ); + + bcp47Tests = [ + // Extracted from BCP 47 (list not exhaustive) + // # 2.1.1 + [ 'en-ca-x-ca', 'en-CA-x-ca' ], + [ 'sgn-be-fr', 'sgn-BE-FR' ], + [ 'az-latn-x-latn', 'az-Latn-x-latn' ], + // # 2.2 + [ 'sr-Latn-RS', 'sr-Latn-RS' ], + [ 'az-arab-ir', 'az-Arab-IR' ], + + // # 2.2.5 + [ 'sl-nedis', 'sl-nedis' ], + [ 
'de-ch-1996', 'de-CH-1996' ], + + // # 2.2.6 + [ + 'en-latn-gb-boont-r-extended-sequence-x-private', + 'en-Latn-GB-boont-r-extended-sequence-x-private' + ], + + // Examples from BCP 47 Appendix A + // # Simple language subtag: + [ 'DE', 'de' ], + [ 'fR', 'fr' ], + [ 'ja', 'ja' ], + + // # Language subtag plus script subtag: + [ 'zh-hans', 'zh-Hans' ], + [ 'sr-cyrl', 'sr-Cyrl' ], + [ 'sr-latn', 'sr-Latn' ], + + // # Extended language subtags and their primary language subtag + // # counterparts: + [ 'zh-cmn-hans-cn', 'zh-cmn-Hans-CN' ], + [ 'cmn-hans-cn', 'cmn-Hans-CN' ], + [ 'zh-yue-hk', 'zh-yue-HK' ], + [ 'yue-hk', 'yue-HK' ], + + // # Language-Script-Region: + [ 'zh-hans-cn', 'zh-Hans-CN' ], + [ 'sr-latn-RS', 'sr-Latn-RS' ], + + // # Language-Variant: + [ 'sl-rozaj', 'sl-rozaj' ], + [ 'sl-rozaj-biske', 'sl-rozaj-biske' ], + [ 'sl-nedis', 'sl-nedis' ], + + // # Language-Region-Variant: + [ 'de-ch-1901', 'de-CH-1901' ], + [ 'sl-it-nedis', 'sl-IT-nedis' ], + + // # Language-Script-Region-Variant: + [ 'hy-latn-it-arevela', 'hy-Latn-IT-arevela' ], + + // # Language-Region: + [ 'de-de', 'de-DE' ], + [ 'en-us', 'en-US' ], + [ 'es-419', 'es-419' ], + + // # Private use subtags: + [ 'de-ch-x-phonebk', 'de-CH-x-phonebk' ], + [ 'az-arab-x-aze-derbend', 'az-Arab-x-aze-derbend' ], + /** + * The previous test deviates from the example given in BCP 47, which is: + * az-Arab-x-AZE-derbend + * AZE is a private-use subtag, which this formatter lowercases. To match + * the BCP 47 example literally, the test above would have to be: + * [ 'az-arab-x-aze-derbend', 'az-Arab-x-AZE-derbend' ], + */ + + // # Private use registry values: + [ 'x-whatever', 'x-whatever' ], + [ 'qaa-qaaa-qm-x-southern', 'qaa-Qaaa-QM-x-southern' ], + [ 'de-qaaa', 'de-Qaaa' ], + [ 'sr-latn-qm', 'sr-Latn-QM' ], + [ 'sr-qaaa-rs', 'sr-Qaaa-RS' ], + + // # Tags that use extensions + [ 'en-us-u-islamcal', 'en-US-u-islamcal' ], + [ 'zh-cn-a-myext-x-private', 'zh-CN-a-myext-x-private' ], + [ 'en-a-myext-b-another', 'en-a-myext-b-another' ] + + // # Invalid: + // de-419-DE + // 
a-DE + // ar-a-aaa-b-bbb-a-ccc + ]; + + QUnit.test( 'mw.language.bcp47', function ( assert ) { + bcp47Tests.forEach( function ( data ) { + var input = data[ 0 ], + expected = data[ 1 ]; + assert.equal( mw.language.bcp47( input ), expected ); + } ); + } ); }( mediaWiki, jQuery ) ); -- 2.20.1