From 8f45c9e03a568fcd7bf0478c438d6c60a943f70c Mon Sep 17 00:00:00 2001
From: Antoine Musso <hashar@users.mediawiki.org>
Date: Sun, 6 Feb 2011 14:47:35 +0000
Subject: [PATCH] bugfix for wfBCP47 and code coverage

Language codes are case insensitive. BCP 47 nonetheless recommends a
canonical formatting. This patch enhances our formatting:
- the tag immediately following the private tag 'x' is now lower cased
- 4-letter tags are now lower cased with the first letter upper cased

Please note the RFC seems to have a bug for az-Arab-x-AZE-derbend,
which should be az-Arab-x-aze-derbend. I have changed our test
to reflect this and added a comment for later reference.
---
 includes/GlobalFunctions.php          |   8 +-
 tests/phpunit/includes/GlobalTest.php | 127 ++++++++++++++++++++++++++
 2 files changed, 133 insertions(+), 2 deletions(-)

diff --git a/includes/GlobalFunctions.php b/includes/GlobalFunctions.php
index 9af9b92000..dcc3f2329d 100644
--- a/includes/GlobalFunctions.php
+++ b/includes/GlobalFunctions.php
@@ -3397,6 +3397,7 @@ function wfShorthandToInteger( $string = '' ) {
 
 /**
  * Get the normalised IETF language tag
+ * See unit test for examples.
  * @param $code String: The language code.
  * @return $langCode String: The language code which complying with BCP 47 standards.
  */
@@ -3404,12 +3405,15 @@ function wfBCP47( $code ) {
 	$codeSegment = explode( '-', $code );
 	foreach ( $codeSegment as $segNo => $seg ) {
 		if ( count( $codeSegment ) > 0 ) {
+			// when previous segment is x, it is a private segment and should be lc 
+			if( $segNo > 0 && strtolower( $codeSegment[($segNo - 1)] ) == 'x') {
+				$codeBCP[$segNo] = strtolower( $seg );
 			// ISO 3166 country code
-			if ( ( strlen( $seg ) == 2 ) && ( $segNo > 0 ) ) {
+			} elseif ( ( strlen( $seg ) == 2 ) && ( $segNo > 0 ) ) {
 				$codeBCP[$segNo] = strtoupper( $seg );
 			// ISO 15924 script code
 			} elseif ( ( strlen( $seg ) == 4 ) && ( $segNo > 0 ) ) {
-				$codeBCP[$segNo] = ucfirst( $seg );
+				$codeBCP[$segNo] = ucfirst( strtolower( $seg ) );
 			// Use lowercase for other cases
 			} else {
 				$codeBCP[$segNo] = strtolower( $seg );
diff --git a/tests/phpunit/includes/GlobalTest.php b/tests/phpunit/includes/GlobalTest.php
index 1a4918e760..a7ba796222 100644
--- a/tests/phpunit/includes/GlobalTest.php
+++ b/tests/phpunit/includes/GlobalTest.php
@@ -632,6 +632,133 @@ class GlobalTest extends MediaWikiTestCase {
 	
 	}
 
+	/**
+	 * Test wfBCP47() against each data set, fed in all-lower then all-upper case.
+	 * BCP 47 states that language tags are case insensitive but recommends a
+	 * canonical casing; both inputs must normalise to the same $expected.
+	 * @param $code String: language tag; its original casing is discarded.
+	 * @param $expected String: tag wfBCP47() is expected to return.
+	 *
+	 * @see http://tools.ietf.org/html/bcp47
+	 * @dataProvider provideLanguageCodes()
+	 */
+	function testBCP47( $code, $expected ) {
+		$code = strtolower( $code );
+		$this->assertEquals( $expected, wfBCP47($code),
+			"Applying BCP47 standard to lower case '$code'"
+		);
+
+		$code = strtoupper( $code );
+		$this->assertEquals( $expected, wfBCP47($code),
+			"Applying BCP47 standard to upper case '$code'"
+		);
+	}
+
+	/**
+	 * Data provider for testBCP47(); each entry is array( $code, $expected ).
+	 */
+	function provideLanguageCodes() {
+		return array(
+			// Extracted from BCP47 (list not exhaustive)
+			# 2.1.1
+			array( 'en-ca-x-ca'    , 'en-CA-x-ca'     ),
+			array( 'sgn-be-fr'     , 'sgn-BE-FR'      ),
+			array( 'az-latn-x-latn', 'az-Latn-x-latn' ),
+			# 2.2
+			array( 'sr-Latn-RS', 'sr-Latn-RS' ),
+			array( 'az-arab-ir', 'az-Arab-IR' ),
+
+			# 2.2.5
+			array( 'sl-nedis'  , 'sl-nedis'   ),
+			array( 'de-ch-1996', 'de-CH-1996' ),
+
+			# 2.2.6
+			array(
+				'en-latn-gb-boont-r-extended-sequence-x-private',
+				'en-Latn-GB-boont-r-extended-sequence-x-private'
+			),
+
+			// Examples from BCP47 Appendix A
+			# Simple language subtag:
+			array( 'DE', 'de' ),
+			array( 'fR', 'fr' ),
+			array( 'ja', 'ja' ),
+
+			# Language subtag plus script subtag:
+			array( 'zh-hans', 'zh-Hans'),
+			array( 'sr-cyrl', 'sr-Cyrl'),
+			array( 'sr-latn', 'sr-Latn'),
+
+			# Extended language subtags and their primary language subtag
+			# counterparts:
+			array( 'zh-cmn-hans-cn', 'zh-cmn-Hans-CN' ),
+			array( 'cmn-hans-cn'   , 'cmn-Hans-CN'    ),
+			array( 'zh-yue-hk'     , 'zh-yue-HK'      ),
+			array( 'yue-hk'        , 'yue-HK'         ),
+
+			# Language-Script-Region:
+			array( 'zh-hans-cn', 'zh-Hans-CN' ),
+			array( 'sr-latn-RS', 'sr-Latn-RS' ),
+
+			# Language-Variant:
+			array( 'sl-rozaj'      , 'sl-rozaj'       ),
+			array( 'sl-rozaj-biske', 'sl-rozaj-biske' ),
+			array( 'sl-nedis'      , 'sl-nedis'       ),
+
+			# Language-Region-Variant:
+			array( 'de-ch-1901'  , 'de-CH-1901'  ),
+			array( 'sl-it-nedis' , 'sl-IT-nedis' ),
+
+			# Language-Script-Region-Variant:
+			array( 'hy-latn-it-arevela', 'hy-Latn-IT-arevela' ),
+
+			# Language-Region:
+			array( 'de-de' , 'de-DE' ),
+			array( 'en-us' , 'en-US' ),
+			array( 'es-419', 'es-419'),
+
+			# Private use subtags:
+			array( 'de-ch-x-phonebk'      , 'de-CH-x-phonebk' ),
+			array( 'az-arab-x-aze-derbend', 'az-Arab-x-aze-derbend' ),
+			/**
+			 * The expectation above deliberately differs from BCP 47, whose
+			 * Appendix A lists this example as az-Arab-x-AZE-derbend.
+			 * 'AZE' follows the private-use 'x', so it is expected lower case
+			 * here. A test following the BCP literally would instead be:
+			#array( 'az-arab-x-aze-derbend', 'az-Arab-x-AZE-derbend' ),
+			 */
+
+			# Private use registry values:
+			array( 'x-whatever', 'x-whatever' ),
+			array( 'qaa-qaaa-qm-x-southern', 'qaa-Qaaa-QM-x-southern' ),
+			array( 'de-qaaa'   , 'de-Qaaa'    ),
+			array( 'sr-latn-qm', 'sr-Latn-QM' ),
+			array( 'sr-qaaa-rs', 'sr-Qaaa-RS' ),
+
+			# Tags that use extensions
+			array( 'en-us-u-islamcal', 'en-US-u-islamcal' ),
+			array( 'zh-cn-a-myext-x-private', 'zh-CN-a-myext-x-private' ),
+			array( 'en-a-myext-b-another', 'en-a-myext-b-another' ),
+
+			# Invalid:
+			// de-419-DE
+			// a-DE
+			// ar-a-aaa-b-bbb-a-ccc
+	
+		/*	
+			// ISO 15924 :
+			array( 'sr-Cyrl', 'sr-Cyrl' ),
+			array( 'SR-lATN', 'sr-Latn' ), # FIXME fix our function?
+			array( 'fr-latn', 'fr-Latn' ),
+			// Use lowercase for single segment
+			// ISO 3166-1-alpha-2 code
+			array( 'US', 'us' ),  # USA
+			array( 'uS', 'us' ),  # USA
+			array( 'Fr', 'fr' ),  # France
+			array( 'va', 'va' ),  # Holy See (Vatican City State)
+		 */);
+	}
+
 	/* TODO: many more! */
 }
 
-- 
2.20.1