<?php
+/**
+ * Maintenance script to generate first letter data files for Collation.php.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @ingroup Maintenance
+ * @file
+ */
-require_once( dirname( __FILE__ ) .'/../Maintenance.php' );
+require_once( __DIR__ .'/../Maintenance.php' );
/**
* Generate first letter data files for Collation.php
/** The primary weights, indexed by codepoint */
var $weights;
- /**
+ /**
* A hashtable keyed by codepoint, where presence indicates that a character
* has a decomposition mapping. This makes it non-preferred for group header
* selection.
public function __construct() {
// Run the parent constructor first, then register this script's own options.
parent::__construct();
// 'data-dir': directory holding the Unicode input files named in the help text.
// NOTE(review): the trailing `false, true` presumably mean "not required" and
// "takes an argument" — confirm against Maintenance::addOption().
$this->addOption( 'data-dir', 'A directory on the local filesystem ' .
- 'containing allkeys.txt and ucd.all.grouped.xml from unicode.org',
+ 'containing allkeys.txt and ucd.all.grouped.xml from unicode.org',
false, true );
// 'debug-output': filename that debug output is sent to (same flag pair as above).
$this->addOption( 'debug-output', 'Filename for sending debug output to',
false, true );
}
function charCallback( $data ) {
- // Skip non-printable characters
+ // Skip non-printable characters,
+ // but do not skip a normal space (U+0020), since
+ // people like to use it as a fake "no header" symbol.
$category = substr( $data['gc'], 0, 1 );
- if ( strpos( 'LNPS', $category ) === false ) {
+ if ( strpos( 'LNPS', $category ) === false
+ && $data['cp'] !== '0020' ) {
return;
}
$cp = hexdec( $data['cp'] );
return;
}
- // Skip the composed Hangul syllables, we will use the bare Jamo
+ // Skip the composed Hangul syllables, we will use the bare Jamo
// as first letters
if ( $data['block'] == 'Hangul Syllables' ) {
return;
}
$this->weights[$cp] = $primary;
if ( $tertiary === '.0008'
- || $tertiary === '.000E' )
+ || $tertiary === '.000E' )
{
$goodTertiaryChars[$cp] = true;
}
}
// If one character has a given primary weight sequence, and a second
- // character has a longer primary weight sequence with an initial
- // portion equal to the first character, then remove the second
+ // character has a longer primary weight sequence with an initial
+ // portion equal to the first character, then remove the second
// character. This avoids having characters like U+A732 (double A)
// polluting the basic latin sort area.
- $prevWeights = array();
+
foreach ( $this->groups as $weight => $group ) {
if ( preg_match( '/(\.[0-9A-F]*)\./', $weight, $m ) ) {
if ( isset( $this->groups[$m[1]] ) ) {
while ( $this->xml->name !== 'ucd' && $this->xml->read() );
$this->xml->read();
return $this->xml;
- }
+ }
/**
- * Read the attributes of the current element node and return them
+ * Read the attributes of the current element node and return them
* as an array
+ * @return array
*/
protected function readAttributes() {
$attrs = array();
}
$maintClass = 'GenerateCollationData';
-require_once( DO_MAINTENANCE );
-
+require_once( RUN_MAINTENANCE_IF_MAIN );