Port BCP47 formatter from PHP to JavaScript
author Niklas Laxström <niklas.laxstrom@gmail.com>
Thu, 7 Sep 2017 12:11:01 +0000 (14:11 +0200)
committer Nikerabbit <niklas.laxstrom@gmail.com>
Tue, 12 Sep 2017 11:07:31 +0000 (11:07 +0000)
It can be accessed via mw.language.bcp47.
To be used in ContentTranslation, see T157212

Change-Id: I37d32ab8a55c7101b903b03869899f00a39afd11

includes/GlobalFunctions.php
resources/src/mediawiki.language/mediawiki.language.js
tests/qunit/suites/resources/mediawiki/mediawiki.language.test.js

index 49159ed..799e3fc 100644 (file)
@@ -3403,6 +3403,7 @@ function wfShorthandToInteger( $string = '', $default = -1 ) {
 /**
  * Get the normalised IETF language tag
  * See unit test for examples.
+ * See mediawiki.language.bcp47 for the JavaScript implementation.
  *
  * @param string $code The language code.
  * @return string The language code complying with BCP 47 standards.
index 3726a68..6a52434 100644 (file)
 
                setSpecialCharacters: function ( data ) {
                        this.specialCharacters = data;
+               },
+
+               /**
+                * Formats language tags according to the BCP47 standard.
+                * See wfBCP47 for the PHP implementation.
+                *
+                * The tag is split on '-' and each segment is re-cased purely by its
+                * position and length; no subtag registry is consulted:
+                *
+                * - the segment directly after an 'x' segment is lower-cased,
+                * - any other non-first segment of length 2 is upper-cased,
+                * - any other non-first segment of length 4 gets an upper-case first
+                *   character and a lower-case remainder,
+                * - everything else, including the first segment, is lower-cased.
+                *
+                * The private-use flag is recomputed after every segment, so only the
+                * one segment immediately following 'x' is forced to lower case; later
+                * segments fall back to the length rules (e.g. 'x-ab-cd' gives
+                * 'x-ab-CD').
+                *
+                * @param {string} languageTag Well-formed language tag
+                * @return {string} The tag with the casing of each segment normalised
+                */
+               bcp47: function ( languageTag ) {
+                       var formatted,
+                               isFirstSegment = true,
+                               isPrivate = false,
+                               segments = languageTag.split( '-' );
+
+                       formatted = segments.map( function ( segment ) {
+                               var newSegment;
+
+                               // The previous segment was x: this one is a private segment and is lower-cased
+                               if ( isPrivate ) {
+                                       newSegment = segment.toLowerCase();
+                               // Two characters, not first: treated as an ISO 3166 country code (upper case)
+                               } else if ( segment.length === 2 && !isFirstSegment ) {
+                                       newSegment = segment.toUpperCase();
+                               // Four characters, not first: treated as an ISO 15924 script code (title case)
+                               } else if ( segment.length === 4 && !isFirstSegment ) {
+                                       newSegment = segment.charAt( 0 ).toUpperCase() + segment.substring( 1 ).toLowerCase();
+                               // Use lowercase for all other cases, including the first segment
+                               } else {
+                                       newSegment = segment.toLowerCase();
+                               }
+
+                               isPrivate = segment.toLowerCase() === 'x';
+                               isFirstSegment = false;
+
+                               return newSegment;
+                       } );
+
+                       return formatted.join( '-' );
                }
        } );
 
index b965079..5ce61ea 100644 (file)
@@ -1,7 +1,7 @@
 ( function ( mw, $ ) {
        'use strict';
 
-       var grammarTests;
+       var grammarTests, bcp47Tests;
 
        QUnit.module( 'mediawiki.language', QUnit.newMwEnvironment( {
                setup: function () {
                assert.equal( mw.language.listToText( [ 'a', 'b' ] ), 'a and b', 'Two items' );
                assert.equal( mw.language.listToText( [ 'a', 'b', 'c' ] ), 'a, b and c', 'More than two items' );
        } );
+
+       bcp47Tests = [
+               // [ input, expected ] pairs, extracted from BCP 47 (list not exhaustive)
+               // # 2.1.1
+               [ 'en-ca-x-ca', 'en-CA-x-ca' ],
+               [ 'sgn-be-fr', 'sgn-BE-FR' ],
+               [ 'az-latn-x-latn', 'az-Latn-x-latn' ],
+               // # 2.2
+               [ 'sr-Latn-RS', 'sr-Latn-RS' ],
+               [ 'az-arab-ir', 'az-Arab-IR' ],
+
+               // # 2.2.5
+               [ 'sl-nedis', 'sl-nedis' ],
+               [ 'de-ch-1996', 'de-CH-1996' ],
+
+               // # 2.2.6
+               [
+                       'en-latn-gb-boont-r-extended-sequence-x-private',
+                       'en-Latn-GB-boont-r-extended-sequence-x-private'
+               ],
+
+               // Examples from BCP 47 Appendix A
+               // # Simple language subtag:
+               [ 'DE', 'de' ],
+               [ 'fR', 'fr' ],
+               [ 'ja', 'ja' ],
+
+               // # Language subtag plus script subtag:
+               [ 'zh-hans', 'zh-Hans' ],
+               [ 'sr-cyrl', 'sr-Cyrl' ],
+               [ 'sr-latn', 'sr-Latn' ],
+
+               // # Extended language subtags and their primary language subtag
+               // # counterparts:
+               [ 'zh-cmn-hans-cn', 'zh-cmn-Hans-CN' ],
+               [ 'cmn-hans-cn', 'cmn-Hans-CN' ],
+               [ 'zh-yue-hk', 'zh-yue-HK' ],
+               [ 'yue-hk', 'yue-HK' ],
+
+               // # Language-Script-Region:
+               [ 'zh-hans-cn', 'zh-Hans-CN' ],
+               [ 'sr-latn-RS', 'sr-Latn-RS' ],
+
+               // # Language-Variant:
+               [ 'sl-rozaj', 'sl-rozaj' ],
+               [ 'sl-rozaj-biske', 'sl-rozaj-biske' ],
+               [ 'sl-nedis', 'sl-nedis' ],
+
+               // # Language-Region-Variant:
+               [ 'de-ch-1901', 'de-CH-1901' ],
+               [ 'sl-it-nedis', 'sl-IT-nedis' ],
+
+               // # Language-Script-Region-Variant:
+               [ 'hy-latn-it-arevela', 'hy-Latn-IT-arevela' ],
+
+               // # Language-Region:
+               [ 'de-de', 'de-DE' ],
+               [ 'en-us', 'en-US' ],
+               [ 'es-419', 'es-419' ],
+
+               // # Private use subtags:
+               [ 'de-ch-x-phonebk', 'de-CH-x-phonebk' ],
+               [ 'az-arab-x-aze-derbend', 'az-Arab-x-aze-derbend' ],
+               /**
+                * Note on the previous test: BCP 47 spells this example as
+                *  az-Arab-x-AZE-derbend
+                * AZE directly follows "x", so mw.language.bcp47 treats it as a
+                * private segment and lower-cases it; the expectation above
+                * therefore uses "aze". Matching the spelling used in BCP 47
+                * would instead require the following test (TODO: confirm which
+                * spelling is intended):
+                * [ 'az-arab-x-aze-derbend', 'az-Arab-x-AZE-derbend' ],
+                */
+
+               // # Private use registry values:
+               [ 'x-whatever', 'x-whatever' ],
+               [ 'qaa-qaaa-qm-x-southern', 'qaa-Qaaa-QM-x-southern' ],
+               [ 'de-qaaa', 'de-Qaaa' ],
+               [ 'sr-latn-qm', 'sr-Latn-QM' ],
+               [ 'sr-qaaa-rs', 'sr-Qaaa-RS' ],
+
+               // # Tags that use extensions
+               [ 'en-us-u-islamcal', 'en-US-u-islamcal' ],
+               [ 'zh-cn-a-myext-x-private', 'zh-CN-a-myext-x-private' ],
+               [ 'en-a-myext-b-another', 'en-a-myext-b-another' ]
+
+               // # Invalid:
+               // de-419-DE
+               // a-DE
+               // ar-a-aaa-b-bbb-a-ccc
+       ];
+
+       QUnit.test( 'mw.language.bcp47', function ( assert ) {
+               bcp47Tests.forEach( function ( data ) {
+                       var input = data[ 0 ],
+                               expected = data[ 1 ];
+                       assert.equal( mw.language.bcp47( input ), expected );
+               } );
+       } );
 }( mediaWiki, jQuery ) );