Title: Add byte class to unicode class conversion for js
authorTimo Tijhof <krinklemail@gmail.com>
Fri, 30 Aug 2013 21:00:40 +0000 (14:00 -0700)
committerTimo Tijhof <krinklemail@gmail.com>
Fri, 27 Sep 2013 17:21:14 +0000 (19:21 +0200)
The upcoming rewrite of mw.Title needs to use wgLegalTitleChars,
but for that to work, it needs to be converted into something
that can work in javascript.

Signed-off-by: Timo Tijhof <krinklemail@gmail.com>
Signed-off-by: David Chan <david@sheetmusic.org.uk>
Change-Id: I163f3d7e3a680d52640a93f4bd195d8209669918

includes/Title.php
includes/resourceloader/ResourceLoaderStartUpModule.php
tests/phpunit/includes/TitleTest.php

index 13350cf..cdbebf1 100644 (file)
@@ -491,6 +491,108 @@ class Title {
                return $rxTc;
        }
 
+       /**
+        * Utility method for converting a character sequence from bytes to Unicode.
+        *
+        * Primary usecase being converting $wgLegalTitleChars to a sequence usable in
+        * javascript, as PHP uses UTF-8 bytes where javascript uses Unicode code units.
+        *
+        * @param string $byteClass
+        * @return string
+        */
+       public static function convertByteClassToUnicodeClass( $byteClass ) {
+               $length = strlen( $byteClass );
+               // Input token queue
+               $x0 = $x1 = $x2 = '';
+               // Decoded queue
+               $d0 = $d1 = $d2 = '';
+               // Decoded integer codepoints
+               $ord0 = $ord1 = $ord2 = 0;
+               // Re-encoded queue
+               $r0 = $r1 = $r2 = '';
+               // Output
+               $out = '';
+               // Flags
+               $allowUnicode = false;
+               for ( $pos = 0; $pos < $length; $pos++ ) {
+                       // Shift the queues down
+                       $x2 = $x1;
+                       $x1 = $x0;
+                       $d2 = $d1;
+                       $d1 = $d0;
+                       $ord2 = $ord1;
+                       $ord1 = $ord0;
+                       $r2 = $r1;
+                       $r1 = $r0;
+                       // Load the current input token and decoded values
+                       $inChar = $byteClass[$pos];
+                       if ( $inChar == '\\' ) {
+                               if ( preg_match( '/x([0-9a-fA-F]{2})/A', $byteClass, $m, 0, $pos + 1 ) ) {
+                                       $x0 = $inChar . $m[0];
+                                       $d0 = chr( hexdec( $m[1] ) );
+                                       $pos += strlen( $m[0] );
+                               } elseif ( preg_match( '/[0-7]{3}/A', $byteClass, $m, 0, $pos + 1 ) ) {
+                                       $x0 = $inChar . $m[0];
+                                       $d0 = chr( octdec( $m[0] ) );
+                                       $pos += strlen( $m[0] );
+                               } elseif ( $pos + 1 >= $length ) {
+                                       $x0 = $d0 = '\\';
+                               } else {
+                                       $d0 = $byteClass[$pos + 1];
+                                       $x0 = $inChar . $d0;
+                                       $pos += 1;
+                               }
+                       } else {
+                               $x0 = $d0 = $inChar;
+                       }
+                       $ord0 = ord( $d0 );
+                       // Load the current re-encoded value
+                       if ( $ord0 < 32 || $ord0 == 0x7f ) {
+                               $r0 = sprintf( '\x%02x', $ord0 );
+                       } elseif ( $ord0 >= 0x80 ) {
+                               // Allow unicode if a single high-bit character appears
+                               $r0 = sprintf( '\x%02x', $ord0 );
+                               $allowUnicode = true;
+                       } elseif ( strpos( '-\\[]^', $d0 ) !== false ) {
+                               $r0 = '\\' . $d0;
+                       } else {
+                               $r0 = $d0;
+                       }
+                       // Do the output
+                       if ( $x0 !== '' && $x1 === '-' && $x2 !== '' ) {
+                               // Range
+                               if ( $ord2 > $ord0 ) {
+                                       // Empty range
+                               } elseif ( $ord0 >= 0x80 ) {
+                                       // Unicode range
+                                       $allowUnicode = true;
+                                       if ( $ord2 < 0x80 ) {
+                                               // Keep the non-unicode section of the range
+                                               $out .= "$r2-\\x7F";
+                                       }
+                               } else {
+                                       // Normal range
+                                       $out .= "$r2-$r0";
+                               }
+                               // Reset state to the initial value
+                               $x0 = $x1 = $d0 = $d1 = $r0 = $r1 = '';
+                       } elseif ( $ord2 < 0x80 ) {
+                               // ASCII character
+                               $out .= $r2;
+                       }
+               }
+               if ( $ord1 < 0x80 ) {
+                       $out .= $r1;
+               }
+               if ( $ord0 < 0x80 ) {
+                       $out .= $r0;
+               }
+               if ( $allowUnicode ) {
+                       $out .= '\u0080-\uFFFF';
+               }
+               return $out;
+       }
+
        /**
         * Get a string representation of a title suitable for
         * including in a search index
index 861ff18..7d13e7b 100644 (file)
@@ -95,6 +95,7 @@ class ResourceLoaderStartUpModule extends ResourceLoaderModule {
                        'wgCookiePrefix' => $wgCookiePrefix,
                        'wgResourceLoaderMaxQueryLength' => $wgResourceLoaderMaxQueryLength,
                        'wgCaseSensitiveNamespaces' => $caseSensitiveNamespaces,
+                       'wgLegalTitleChars' => Title::convertByteClassToUnicodeClass( Title::legalChars() ),
                );
 
                wfRunHooks( 'ResourceLoaderGetConfigVars', array( &$vars ) );
index 33bd8d6..73786b9 100644 (file)
@@ -32,6 +32,74 @@ class TitleTest extends MediaWikiTestCase {
                }
        }
 
+       public static function provideConvertByteClassToUnicodeClass() {
+               return array(
+                       array(
+                               ' %!"$&\'()*,\\-.\\/0-9:;=?@A-Z\\\\^_`a-z~\\x80-\\xFF+',
+                               ' %!"$&\'()*,\\-./0-9:;=?@A-Z\\\\\\^_`a-z~+\\u0080-\\uFFFF',
+                       ),
+                       array(
+                               'QWERTYf-\\xFF+',
+                               'QWERTYf-\\x7F+\\u0080-\\uFFFF',
+                       ),
+                       array(
+                               'QWERTY\\x66-\\xFD+',
+                               'QWERTYf-\\x7F+\\u0080-\\uFFFF',
+                       ),
+                       array(
+                               'QWERTYf-y+',
+                               'QWERTYf-y+',
+                       ),
+                       array(
+                               'QWERTYf-\\x80+',
+                               'QWERTYf-\\x7F+\\u0080-\\uFFFF',
+                       ),
+                       array(
+                               'QWERTY\\x66-\\x80+\\x23',
+                               'QWERTYf-\\x7F+#\\u0080-\\uFFFF',
+                       ),
+                       array(
+                               'QWERTY\\x66-\\x80+\\xD3',
+                               'QWERTYf-\\x7F+\\u0080-\\uFFFF',
+                       ),
+                       array(
+                               '\\\\\\x99',
+                               '\\\\\\u0080-\\uFFFF',
+                       ),
+                       array(
+                               '-\\x99',
+                               '\\-\\u0080-\\uFFFF',
+                       ),
+                       array(
+                               'QWERTY\\-\\x99',
+                               'QWERTY\\-\\u0080-\\uFFFF',
+                       ),
+                       array(
+                               '\\\\x99',
+                               '\\\\x99',
+                       ),
+                       array(
+                               'A-\\x9F',
+                               'A-\\x7F\\u0080-\\uFFFF',
+                       ),
+                       array(
+                               '\\x66-\\x77QWERTY\\x88-\\x91FXZ',
+                               'f-wQWERTYFXZ\\u0080-\\uFFFF',
+                       ),
+                       array(
+                               '\\x66-\\x99QWERTY\\xAA-\\xEEFXZ',
+                               'f-\\x7FQWERTYFXZ\\u0080-\\uFFFF',
+                       ),
+               );
+       }
+
+       /**
+        * @dataProvider provideConvertByteClassToUnicodeClass
+        */
+       function testConvertByteClassToUnicodeClass( $byteClass, $unicodeClass ) {
+               $this->assertEquals( $unicodeClass, Title::convertByteClassToUnicodeClass( $byteClass ) );
+       }
+
        /**
         * @dataProvider provideBug31100
         */