Follow-up r62648: Add new message key to maintenance script
[lhc/web/wiklou.git] / maintenance / language / generateNormalizerData.php
1 <?php
2
3 require_once( dirname( __FILE__ ) . '/../Maintenance.php' );
4
5 require_once( dirname( __FILE__ ) . '/../../includes/normal/UtfNormalUtil.php' );
6
/**
 * Generates normalizer data files for Arabic and Malayalam.
 * For NFC see includes/normal.
 *
 * The Arabic table is derived from a local copy of UnicodeData.txt; the
 * Malayalam table is a fixed list. Each table is serialized to
 * $IP/serialized/normalize-<lang>.ser as an array mapping a UTF-8 source
 * string to its UTF-8 replacement.
 */
class GenerateNormalizerData extends Maintenance {
	/**
	 * Path of the UnicodeData.txt file to read. Set by execute().
	 * @var string
	 */
	public $dataFile;

	/**
	 * Registers the optional --unicode-data-file command line option.
	 */
	public function __construct() {
		parent::__construct();
		$this->addOption( 'unicode-data-file', 'The local location of the data file ' .
			'from http://unicode.org/Public/UNIDATA/UnicodeData.txt', false, true );
	}

	/**
	 * Locates the Unicode data file and generates both normalizer tables.
	 *
	 * When --unicode-data-file is not given, "UnicodeData.txt" relative to
	 * the current working directory is used. The script reports an error and
	 * exits with status 1 if the chosen file does not exist.
	 */
	public function execute() {
		if ( $this->hasOption( 'unicode-data-file' ) ) {
			$this->dataFile = $this->getOption( 'unicode-data-file' );
			$notFoundMessage = 'Unable to find the specified data file.';
		} else {
			$this->dataFile = 'UnicodeData.txt';
			$notFoundMessage = "Unable to find UnicodeData.txt. Please specify its location with --unicode-data-file=<FILE>";
		}

		if ( !file_exists( $this->dataFile ) ) {
			$this->error( $notFoundMessage );
			exit( 1 );
		}

		$this->generateArabic();
		$this->generateMalayalam();
	}

	/**
	 * Builds the Arabic normalization table from the Unicode data file.
	 *
	 * Every code point in the Arabic Presentation Forms A and B ranges that
	 * has a tagged decomposition (e.g. "<isolated> 0671") is mapped to the
	 * code point sequence following the tag. The result is written to
	 * normalize-ar.ser.
	 */
	public function generateArabic() {
		$file = fopen( $this->dataFile, 'r' );
		if ( !$file ) {
			$this->error( 'Unable to open the data file.' );
			exit( 1 );
		}

		// For the file format, see http://www.unicode.org/reports/tr44/
		// Only the leading fields are named; the index in this list is the
		// field's position on each semicolon-separated line.
		$fieldNames = array(
			'Code',
			'Name',
			'General_Category',
			'Canonical_Combining_Class',
			'Bidi_Class',
			'Decomposition_Type_Mapping',
			'Numeric_Type_Value',
			'Bidi_Mirrored',
			'Unicode_1_Name',
			'ISO_Comment',
			'Simple_Uppercase_Mapping',
			'Simple_Lowercase_Mapping',
			'Simple_Titlecase_Mapping'
		);
		$fieldCount = count( $fieldNames );

		$pairs = array();

		$lineNum = 0;
		while ( false !== ( $line = fgets( $file ) ) ) {
			++$lineNum;

			# Strip comments
			$line = trim( substr( $line, 0, strcspn( $line, '#' ) ) );
			if ( $line === '' ) {
				continue;
			}

			# Split fields
			$numberedData = explode( ';', $line );
			if ( count( $numberedData ) < $fieldCount ) {
				// A truncated line would otherwise raise undefined-offset
				// notices below; report it and move on.
				$this->error( "Too few fields on line $lineNum" );
				$this->error( $line );
				continue;
			}
			$data = array();
			foreach ( $fieldNames as $number => $name ) {
				$data[$name] = $numberedData[$number];
			}

			// hexdec() gives a number, so the range checks below compare
			// numbers rather than a numeric string against integers.
			$code = hexdec( $data['Code'] );
			if ( ( $code >= 0xFB50 && $code <= 0xFDFF ) # Arabic presentation forms A
				|| ( $code >= 0xFE70 && $code <= 0xFEFF ) ) # Arabic presentation forms B
			{
				if ( $data['Decomposition_Type_Mapping'] === '' ) {
					// No decomposition
					continue;
				}
				// $m[1] is the "<tag>", $m[2] the space-separated hex code points.
				if ( !preg_match( '/^ *(<\w*>) +([0-9A-F ]*)$/',
					$data['Decomposition_Type_Mapping'], $m ) )
				{
					$this->error( "Can't parse Decomposition_Type/Mapping on line $lineNum" );
					$this->error( $line );
					continue;
				}

				// hexSequenceToUtf8() is not defined in this file; it is
				// expected to come from one of the files required above.
				$source = hexSequenceToUtf8( $data['Code'] );
				$dest = hexSequenceToUtf8( $m[2] );
				$pairs[$source] = $dest;
			}
		}

		// Release the handle; it was previously left open until script exit.
		fclose( $file );

		$this->writePairs( 'ar', $pairs );
	}

	/**
	 * Builds the Malayalam normalization table from a fixed list.
	 *
	 * Each entry maps a three code point sequence ending in U+200D to a
	 * single code point. The result is written to normalize-ml.ser.
	 */
	public function generateMalayalam() {
		$hexPairs = array(
			# From http://unicode.org/versions/Unicode5.1.0/#Malayalam_Chillu_Characters
			'0D23 0D4D 200D' => '0D7A',
			'0D28 0D4D 200D' => '0D7B',
			'0D30 0D4D 200D' => '0D7C',
			'0D32 0D4D 200D' => '0D7D',
			'0D33 0D4D 200D' => '0D7E',

			# From http://permalink.gmane.org/gmane.science.linguistics.wikipedia.technical/46413
			'0D15 0D4D 200D' => '0D7F',
		);

		$pairs = array();
		foreach ( $hexPairs as $hexSource => $hexDest ) {
			$source = hexSequenceToUtf8( $hexSource );
			$dest = hexSequenceToUtf8( $hexDest );
			$pairs[$source] = $dest;
		}

		$this->writePairs( 'ml', $pairs );
	}

	/**
	 * Serializes a normalization table to $IP/serialized/normalize-<lang>.ser
	 * and prints how many pairs were written.
	 *
	 * Reports an error and exits with status 1 if the file cannot be written,
	 * instead of claiming success after a failed write.
	 *
	 * @param $langCode String: language code used in the file name and the report
	 * @param $pairs Array: map of UTF-8 source string => UTF-8 replacement
	 */
	private function writePairs( $langCode, $pairs ) {
		global $IP;
		$path = "$IP/serialized/normalize-$langCode.ser";
		if ( file_put_contents( $path, serialize( $pairs ) ) === false ) {
			$this->error( "Unable to write $path" );
			exit( 1 );
		}
		echo "$langCode: " . count( $pairs ) . " pairs written.\n";
	}
}
134
// Name the maintenance class defined in this file, then include the file
// named by the DO_MAINTENANCE constant (defined outside this file).
$maintClass = 'GenerateNormalizerData';
require_once DO_MAINTENANCE;
137