return $rxTc;
}
+ /**
+ * Utility method for converting a character sequence from bytes to Unicode.
+ *
+ * Primary usecase being converting $wgLegalTitleChars to a sequence usable in
+ * javascript, as PHP uses UTF-8 bytes where javascript uses Unicode code units.
+ *
+ * @param string $byteClass
+ * @return string
+ */
+ public static function convertByteClassToUnicodeClass( $byteClass ) {
+ $length = strlen( $byteClass );
+ // Input token queue
+ $x0 = $x1 = $x2 = '';
+ // Decoded queue
+ $d0 = $d1 = $d2 = '';
+ // Decoded integer codepoints
+ $ord0 = $ord1 = $ord2 = 0;
+ // Re-encoded queue
+ $r0 = $r1 = $r2 = '';
+ // Output
+ $out = '';
+ // Flags
+ $allowUnicode = false;
+ for ( $pos = 0; $pos < $length; $pos++ ) {
+ // Shift the queues down
+ $x2 = $x1;
+ $x1 = $x0;
+ $d2 = $d1;
+ $d1 = $d0;
+ $ord2 = $ord1;
+ $ord1 = $ord0;
+ $r2 = $r1;
+ $r1 = $r0;
+ // Load the current input token and decoded values
+ $inChar = $byteClass[$pos];
+ if ( $inChar == '\\' ) {
+ if ( preg_match( '/x([0-9a-fA-F]{2})/A', $byteClass, $m, 0, $pos + 1 ) ) {
+ $x0 = $inChar . $m[0];
+ $d0 = chr( hexdec( $m[1] ) );
+ $pos += strlen( $m[0] );
+ } elseif ( preg_match( '/[0-7]{3}/A', $byteClass, $m, 0, $pos + 1 ) ) {
+ $x0 = $inChar . $m[0];
+ $d0 = chr( octdec( $m[0] ) );
+ $pos += strlen( $m[0] );
+ } elseif ( $pos + 1 >= $length ) {
+ $x0 = $d0 = '\\';
+ } else {
+ $d0 = $byteClass[$pos + 1];
+ $x0 = $inChar . $d0;
+ $pos += 1;
+ }
+ } else {
+ $x0 = $d0 = $inChar;
+ }
+ $ord0 = ord( $d0 );
+ // Load the current re-encoded value
+ if ( $ord0 < 32 || $ord0 == 0x7f ) {
+ $r0 = sprintf( '\x%02x', $ord0 );
+ } elseif ( $ord0 >= 0x80 ) {
+ // Allow unicode if a single high-bit character appears
+ $r0 = sprintf( '\x%02x', $ord0 );
+ $allowUnicode = true;
+ } elseif ( strpos( '-\\[]^', $d0 ) !== false ) {
+ $r0 = '\\' . $d0;
+ } else {
+ $r0 = $d0;
+ }
+ // Do the output
+ if ( $x0 !== '' && $x1 === '-' && $x2 !== '' ) {
+ // Range
+ if ( $ord2 > $ord0 ) {
+ // Empty range
+ } elseif ( $ord0 >= 0x80 ) {
+ // Unicode range
+ $allowUnicode = true;
+ if ( $ord2 < 0x80 ) {
+ // Keep the non-unicode section of the range
+ $out .= "$r2-\\x7F";
+ }
+ } else {
+ // Normal range
+ $out .= "$r2-$r0";
+ }
+ // Reset state to the initial value
+ $x0 = $x1 = $d0 = $d1 = $r0 = $r1 = '';
+ } elseif ( $ord2 < 0x80 ) {
+ // ASCII character
+ $out .= $r2;
+ }
+ }
+ if ( $ord1 < 0x80 ) {
+ $out .= $r1;
+ }
+ if ( $ord0 < 0x80 ) {
+ $out .= $r0;
+ }
+ if ( $allowUnicode ) {
+ $out .= '\u0080-\uFFFF';
+ }
+ return $out;
+ }
+
/**
* Get a string representation of a title suitable for
* including in a search index
}
}
+ public static function provideConvertByteClassToUnicodeClass() {
+ return array(
+ array(
+ ' %!"$&\'()*,\\-.\\/0-9:;=?@A-Z\\\\^_`a-z~\\x80-\\xFF+',
+ ' %!"$&\'()*,\\-./0-9:;=?@A-Z\\\\\\^_`a-z~+\\u0080-\\uFFFF',
+ ),
+ array(
+ 'QWERTYf-\\xFF+',
+ 'QWERTYf-\\x7F+\\u0080-\\uFFFF',
+ ),
+ array(
+ 'QWERTY\\x66-\\xFD+',
+ 'QWERTYf-\\x7F+\\u0080-\\uFFFF',
+ ),
+ array(
+ 'QWERTYf-y+',
+ 'QWERTYf-y+',
+ ),
+ array(
+ 'QWERTYf-\\x80+',
+ 'QWERTYf-\\x7F+\\u0080-\\uFFFF',
+ ),
+ array(
+ 'QWERTY\\x66-\\x80+\\x23',
+ 'QWERTYf-\\x7F+#\\u0080-\\uFFFF',
+ ),
+ array(
+ 'QWERTY\\x66-\\x80+\\xD3',
+ 'QWERTYf-\\x7F+\\u0080-\\uFFFF',
+ ),
+ array(
+ '\\\\\\x99',
+ '\\\\\\u0080-\\uFFFF',
+ ),
+ array(
+ '-\\x99',
+ '\\-\\u0080-\\uFFFF',
+ ),
+ array(
+ 'QWERTY\\-\\x99',
+ 'QWERTY\\-\\u0080-\\uFFFF',
+ ),
+ array(
+ '\\\\x99',
+ '\\\\x99',
+ ),
+ array(
+ 'A-\\x9F',
+ 'A-\\x7F\\u0080-\\uFFFF',
+ ),
+ array(
+ '\\x66-\\x77QWERTY\\x88-\\x91FXZ',
+ 'f-wQWERTYFXZ\\u0080-\\uFFFF',
+ ),
+ array(
+ '\\x66-\\x99QWERTY\\xAA-\\xEEFXZ',
+ 'f-\\x7FQWERTYFXZ\\u0080-\\uFFFF',
+ ),
+ );
+ }
+
+ /**
+ * @dataProvider provideConvertByteClassToUnicodeClass
+ */
+ function testConvertByteClassToUnicodeClass( $byteClass, $unicodeClass ) {
+ $this->assertEquals( $unicodeClass, Title::convertByteClassToUnicodeClass( $byteClass ) );
+ }
+
/**
* @dataProvider provideBug31100
*/