mw.Title: Add new static methods `newFromFileName`, `newFromUserInput`
author rillke <rainerrillke@hotmail.com>
Wed, 7 May 2014 13:17:16 +0000 (15:17 +0200)
committer TheDJ <hartman.wiki@gmail.com>
Sun, 2 Nov 2014 16:56:53 +0000 (16:56 +0000)
This is the replacement for Ia57e4b0e804.

Use cases:

- UploadWizard: Suggesting valid titles for files supplied by user
  whose names are invalid MediaWiki titles.
- Upload Form: The same.
- Community scripts, VisualEditor upload gadget, [...]: The same.

Bug: 64912
Bug: 64922
Change-Id: I260ba883548975263bba03d25f6d11d3d9bbfbd9

resources/src/mediawiki/mediawiki.Title.js
tests/qunit/suites/resources/mediawiki/mediawiki.Title.test.js

index 95b18a8..7ced42f 100644 (file)
         */
        NS_SPECIAL = -1,
 
+       /**
+        * Namespace id of the Media namespace.
+        *
+        * @private
+        * @static
+        * @property NS_MEDIA
+        */
+       NS_MEDIA = -2,
+
+       /**
+        * Namespace id of the File namespace.
+        *
+        * @private
+        * @static
+        * @property NS_FILE
+        */
+       NS_FILE = 6,
+
+       /**
+        * Byte limit for a complete file name (name, dot and extension),
+        * used by #trimFileNameToByteLength.
+        *
+        * @private
+        * @static
+        * @property FILENAME_MAX_BYTES
+        */
+       FILENAME_MAX_BYTES = 240,
+
+       /**
+        * Byte limit for titles outside the Special namespace
+        * (size of the underlying database field).
+        *
+        * @private
+        * @static
+        * @property TITLE_MAX_BYTES
+        */
+       TITLE_MAX_BYTES = 255,
+
        /**
         * Get the namespace id from a namespace name (either from the localized, canonical or alias
         * name).
                '|&#x[0-9A-Fa-f]+;'
        ),
 
+       // From MediaWikiTitleCodec.php#L225 @26fcab1f18c568a41
+       // "Clean up whitespace" in function MediaWikiTitleCodec::splitTitleString()
+       rWhitespace = /[ _\u0009\u00A0\u1680\u180E\u2000-\u200A\u2028\u2029\u202F\u205F\u3000\s]+/g,
+
+       /**
+        * Slightly modified from Flinfo. Credit goes to Lupo and Flominator.
+        *
+        * Ordered list of replacement rules consumed by #sanitize. Each rule consists of
+        * a `pattern`, a `replace` string and one flag (`generalRule` or `fileRule`).
+        * #sanitize walks the list from first to last entry and only applies the rules
+        * whose flag was requested by the caller, so the order of the entries matters.
+        *
+        * @private
+        * @static
+        * @property sanitationRules
+        */
+       sanitationRules = [
+               // "signature": every run of three tildes is removed
+               {
+                       pattern: /~{3}/g,
+                       replace: '',
+                       generalRule: true
+               },
+               // Space, underscore, tab, NBSP and other unusual spaces (runs collapse to one space)
+               {
+                       pattern: rWhitespace,
+                       replace: ' ',
+                       generalRule: true
+               },
+               // unicode bidi override characters: Implicit, Embeds, Overrides
+               {
+                       pattern: /[\u200E\u200F\u202A-\u202E]/g,
+                       replace: '',
+                       generalRule: true
+               },
+               // control characters
+               {
+                       pattern: /[\x00-\x1f\x7f]/g,
+                       replace: '',
+                       generalRule: true
+               },
+               // URL encoding (possibly): a space is inserted after "%" to break up the sequence
+               {
+                       pattern: /%([0-9A-Fa-f]{2})/g,
+                       replace: '% $1',
+                       generalRule: true
+               },
+               // HTML-character-entities: a space is inserted after "&" to break up the entity
+               {
+                       pattern: /&(([A-Za-z0-9\x80-\xff]+|#[0-9]+|#x[0-9A-Fa-f]+);)/g,
+                       replace: '& $1',
+                       generalRule: true
+               },
+               // slash, colon, hash (not supported by file systems like NTFS/Windows, Mac OS 9 [:], ext4 [/]); file names only
+               {
+                       pattern: /[:\/#]/g,
+                       replace: '-',
+                       fileRule: true
+               },
+               // brackets, greater than
+               {
+                       pattern: /[\]\}>]/g,
+                       replace: ')',
+                       generalRule: true
+               },
+               // brackets, lower than
+               {
+                       pattern: /[\[\{<]/g,
+                       replace: '(',
+                       generalRule: true
+               },
+               // everything that wasn't covered yet (rInvalid re-created with the global flag)
+               {
+                       pattern: new RegExp( rInvalid.source, 'g' ),
+                       replace: '-',
+                       generalRule: true
+               },
+               // directory structures: the whole string is dropped if it is or contains a "." or ".." path segment
+               {
+                       pattern: /^(\.|\.\.|\.\/.*|\.\.\/.*|.*\/\.\/.*|.*\/\.\.\/.*|.*\/\.|.*\/\.\.)$/g,
+                       replace: '',
+                       generalRule: true
+               }
+       ],
+
        /**
         * Internal helper for #constructor and #newFromtext.
         *
                        return false;
                }
 
-               // Disallow titles exceeding the 255 byte size limit (size of underlying database field)
+               // Disallow titles exceeding the TITLE_MAX_BYTES byte size limit (size of underlying database field)
                // Except for special pages, e.g. [[Special:Block/Long name]]
                // Note: The PHP implementation also asserts that even in NS_SPECIAL, the title should
                // be less than 512 bytes.
-               if ( namespace !== NS_SPECIAL && $.byteLength( title ) > 255 ) {
+               if ( namespace !== NS_SPECIAL && $.byteLength( title ) > TITLE_MAX_BYTES ) {
                        return false;
                }
 
                }
        },
 
+       /**
+        * Sanitizes a string based on a rule set and a filter.
+        *
+        * Walks #sanitationRules in array order and applies a rule as soon as one of
+        * the flag names listed in `filter` is set on it. Every rule is applied at most
+        * once per call, even if several of the requested flags are set on the same rule.
+        *
+        * @private
+        * @static
+        * @method sanitize
+        * @param {string} s String to clean up
+        * @param {Array} filter Names of the rule flags to honour,
+        *  e.g. `[ 'generalRule', 'fileRule' ]`
+        * @return {string} The string after all selected replacements were made
+        */
+       sanitize = function ( s, filter ) {
+               var i, ruleLength, rule, m, filterLength,
+                       rules = sanitationRules;
+
+               for ( i = 0, ruleLength = rules.length; i < ruleLength; ++i ) {
+                       rule = rules[i];
+                       for ( m = 0, filterLength = filter.length; m < filterLength; ++m ) {
+                               if ( rule[filter[m]] ) {
+                                       s = s.replace( rule.pattern, rule.replace );
+                                       // One matching flag is enough; running the same rule again for
+                                       // another flag could alter text the first pass just produced
+                                       break;
+                               }
+                       }
+               }
+               return s;
+       },
+
+       /**
+        * Cuts a string to a specific byte length, assuming UTF-8
+        * or less, if the last character is a multi-byte one.
+        *
+        * A negative `length` is treated as 0 and yields an empty string. The result
+        * never ends in the first half of a surrogate pair that was split by the cut.
+        *
+        * @private
+        * @static
+        * @method trimToByteLength
+        * @param {string} s
+        * @param {number} length Maximum number of bytes the result may occupy
+        * @return {string}
+        */
+       trimToByteLength = function ( s, length ) {
+               var byteLength, chopOffChars, chopOffBytes,
+                       untrimmedLength = s.length;
+
+               // A negative limit can never be met: the empty string has 0 bytes, so the
+               // loop below would spin forever. Treat it like a limit of 0 instead.
+               length = Math.max( 0, length );
+
+               // bytelength is always greater or equal to the length in characters
+               s = s.substr( 0, length );
+               while ( ( byteLength = $.byteLength( s ) ) > length ) {
+                       // Calculate how many characters can be safely removed
+                       // First, we need to know how many bytes the string exceeds the threshold
+                       chopOffBytes = byteLength - length;
+                       // A character in UTF-8 is at most 4 bytes
+                       // One character must be removed in any case because the
+                       // string is too long
+                       chopOffChars = Math.max( 1, Math.floor( chopOffBytes / 4 ) );
+                       s = s.substr( 0, s.length - chopOffChars );
+               }
+
+               // substr() counts UTF-16 code units, so the cut may have gone right through
+               // a surrogate pair. A high surrogate at the very end has lost its partner
+               // and is not a valid character on its own; drop it.
+               if ( s.length < untrimmedLength && /[\uD800-\uDBFF]$/.test( s ) ) {
+                       s = s.substr( 0, s.length - 1 );
+               }
+               return s;
+       },
+
+       /**
+        * Cuts a file name to a specific byte length.
+        *
+        * Only the name part is shortened; the extension is appended unchanged. The
+        * space available for the name is FILENAME_MAX_BYTES minus the bytes taken
+        * by the extension and the separating dot.
+        *
+        * @private
+        * @static
+        * @method trimFileNameToByteLength
+        * @param {string} name without extension
+        * @param {string} extension file extension
+        * @return {string} The full name, including extension
+        */
+       trimFileNameToByteLength = function ( name, extension ) {
+               // There is a special byte limit for file names and ... remember the dot.
+               // The limit is in bytes, so the extension is measured in bytes as well
+               // (a non-ASCII extension takes more bytes than it has characters).
+               // An over-long extension must not produce a negative budget for the name.
+               var nameBytes = Math.max( 0, FILENAME_MAX_BYTES - $.byteLength( extension ) - 1 );
+
+               return trimToByteLength( name, nameBytes ) + '.' + extension;
+       },
+
        // Polyfill for ES5 Object.create
        createObject = Object.create || ( function () {
                return function ( o ) {
         * Constructor for Title objects with a null return instead of an exception for invalid titles.
         *
         * @static
-        * @method
         * @param {string} title
         * @param {number} [namespace=NS_MAIN] Default namespace
         * @return {mw.Title|null} A valid Title object or null if the title is invalid
                return t;
        };
 
+       /**
+        * Constructor for Title objects from user input altering that input to
+        * produce a title that MediaWiki will accept as legal
+        *
+        * The input is whitespace-normalised, a leading colon and a namespace prefix
+        * are processed, the remaining text is run through #sanitize and cut to the
+        * applicable byte limit, and the result is handed to #newFromText.
+        *
+        * `defaultNamespace` may be left out; `options` can then be passed as the
+        * second argument.
+        *
+        * @static
+        * @param {string} title
+        * @param {number} [defaultNamespace=NS_MAIN]
+        *  If given, will be used as default namespace for the given title.
+        * @param {Object} [options] additional options
+        * @param {string} [options.fileExtension='']
+        *  If the title is about to be created for the Media or File namespace,
+        *  ensures the resulting Title has the correct extension. Useful, for example
+        *  on systems that predict the type by content-sniffing, not by file extension.
+        *  If different from empty string, `forUploading` is assumed.
+        * @param {boolean} [options.forUploading=true]
+        *  Makes sure that a file is uploadable under the title returned.
+        *  There are pages in the file namespace under which file upload is impossible.
+        *  Automatically assumed if the title is created in the Media namespace.
+        * @return {mw.Title|null} A valid Title object or null if the input cannot be turned into a valid title
+        *  (this includes file names without extension when no `fileExtension` was supplied)
+        */
+       Title.newFromUserInput = function ( title, defaultNamespace, options ) {
+               var namespace, m, id, ext, parts, normalizeExtension;
+
+               // defaultNamespace is optional; check whether options moves up
+               if ( arguments.length < 3 && $.type( defaultNamespace ) === 'object' ) {
+                       options = defaultNamespace;
+                       defaultNamespace = undefined;
+               }
+
+               // merge options into defaults
+               options = $.extend( {
+                       fileExtension: '',
+                       forUploading: true
+               }, options );
+
+               normalizeExtension = function ( extension ) {
+                       // Remove only trailing space (that is removed by MW anyway)
+                       extension = extension.toLowerCase().replace(/\s*$/, '');
+                       return extension;
+               };
+
+               namespace = defaultNamespace === undefined ? NS_MAIN : defaultNamespace;
+
+               // Normalise whitespace and remove duplicates
+               title = $.trim( title.replace( rWhitespace, ' ' ) );
+
+               // Process initial colon
+               if ( title !== '' && title.charAt( 0 ) === ':' ) {
+                       // Initial colon means main namespace instead of specified default
+                       namespace = NS_MAIN;
+                       title = title
+                               // Strip colon
+                               .substr( 1 )
+                               // Trim underscores
+                               .replace( rUnderscoreTrim, '' );
+               }
+
+               // Process namespace prefix (if any)
+               m = title.match( rSplit );
+               if ( m ) {
+                       id = getNsIdByName( m[1] );
+                       if ( id !== false ) {
+                               // Ordinary namespace
+                               namespace = id;
+                               title = m[2];
+                       }
+               }
+
+               if ( namespace === NS_MEDIA
+                       || ( ( options.forUploading || options.fileExtension ) && ( namespace === NS_FILE ) )
+               ) {
+
+                       title = sanitize( title, [ 'generalRule', 'fileRule' ] );
+
+                       // Operate on the file extension
+                       // Although it is possible having spaces between the name and the ".ext" this isn't nice for
+                       // operating systems hiding file extensions -> strip them later on
+                       parts = title.split( '.' );
+
+                       if ( parts.length > 1 ) {
+
+                               // Get the last part, which is supposed to be the file extension
+                               ext = parts.pop();
+
+                               // Does the supplied file name carry the desired file extension?
+                               // (compared case-insensitively and ignoring trailing whitespace)
+                               if ( options.fileExtension
+                                       && normalizeExtension( ext ) !== normalizeExtension( options.fileExtension )
+                               ) {
+
+                                       // No, push back, whatever there was after the dot
+                                       parts.push( ext );
+
+                                       // And add the desired file extension later
+                                       ext = options.fileExtension;
+                               }
+
+                               // Remove whitespace around the name part (the part without extension)
+                               title = $.trim( parts.join( '.' ) );
+
+                               // Cut, if too long and append file extension
+                               title = trimFileNameToByteLength( title, ext );
+
+                       } else {
+
+                               // Missing file extension
+                               title = $.trim( parts.join( '.' ) );
+
+                               if ( options.fileExtension ) {
+
+                                       // Cut, if too long and append the desired file extension
+                                       title = trimFileNameToByteLength( title, options.fileExtension );
+
+                               } else {
+
+                                       // Name has no file extension and a fallback wasn't provided either
+                                       return null;
+                               }
+                       }
+               } else {
+
+                       title = sanitize( title, [ 'generalRule' ] );
+
+                       // Cut titles exceeding the TITLE_MAX_BYTES byte size limit
+                       // (size of underlying database field)
+                       if ( namespace !== NS_SPECIAL ) {
+                               title = trimToByteLength( title, TITLE_MAX_BYTES );
+                       }
+               }
+
+               // Any remaining initial :s are illegal.
+               title = title.replace( /^\:+/, '' );
+
+               return Title.newFromText( title, namespace );
+       };
+
+       /**
+        * Builds a Title in the File namespace from a raw file name, e.g. one coming
+        * from the user's file system. The name is prefixed with `File:` and cleaned up
+        * by #newFromUserInput with `forUploading` enabled, so the result is most likely
+        * a valid MediaWiki title and file name.
+        * Returns null on fatal errors.
+        *
+        * @static
+        * @param {string} uncleanName The unclean file name including file extension but
+        *   without namespace
+        * @param {string} [fileExtension] the desired file extension
+        * @return {mw.Title|null} A valid Title object or null if the title is invalid
+        */
+       Title.newFromFileName = function ( uncleanName, fileExtension ) {
+               var prefixedName = 'File:' + uncleanName,
+                       uploadOptions = {
+                               forUploading: true,
+                               fileExtension: fileExtension
+                       };
+
+               return Title.newFromUserInput( prefixedName, uploadOptions );
+       };
+
        /**
         * Get the file title from an image element
         *
index 5ece31b..7ab309a 100644 (file)
                        assert.equal( title.getRelativeText( thisCase.relativeTo ), thisCase.expectedResult );
                }
        } );
+
+       QUnit.test( 'newFromUserInput', 8, function ( assert ) { // 8 = 4 cases x 2 assertions each
+               var title, i, thisCase, prefix,
+                       cases = [
+                               {
+                                       title: 'DCS0001557854455.JPG',
+                                       defaultNamespace: 0,
+                                       options: {
+                                               fileExtension: 'PNG'
+                                       },
+                                       expected: 'DCS0001557854455.JPG',
+                                       description: 'Title in normal namespace without anything invalid but with "file extension"'
+                               },
+                               {
+                                       title: 'MediaWiki:Msg-awesome',
+                                       defaultNamespace: undefined,
+                                       expected: 'MediaWiki:Msg-awesome',
+                                       description: 'Full title (page in MediaWiki namespace) supplied as string'
+                               },
+                               {
+                                       title: 'The/Mw/Sound.flac',
+                                       defaultNamespace: -2,
+                                       expected: 'Media:The-Mw-Sound.flac',
+                                       description: 'Page in Media-namespace without explicit options'
+                               },
+                               {
+                                       title: 'File:The/Mw/Sound.kml',
+                                       defaultNamespace: 6,
+                                       options: {
+                                               forUploading: false
+                                       },
+                                       expected: 'File:The/Mw/Sound.kml',
+                                       description: 'Page in File-namespace without explicit options' // NOTE(review): options are passed here (forUploading: false); wording looks copy-pasted
+                               }
+                       ];
+
+               for ( i = 0; i < cases.length; i++ ) {
+                       thisCase = cases[i];
+                       title = mw.Title.newFromUserInput( thisCase.title, thisCase.defaultNamespace, thisCase.options );
+
+                       if ( thisCase.expected !== undefined ) { // case expects a Title: two assertions
+                               prefix = '[' + thisCase.description + '] ';
+
+                               assert.notStrictEqual( title, null, prefix + 'Parses successfully' );
+                               assert.equal( title.toText(), thisCase.expected, prefix + 'Title as expected' );
+                       } else { // case expects null: one assertion
+                               assert.strictEqual( title, null, thisCase.description + ', should not produce an mw.Title object' );
+                       }
+               }
+       } );
+
+       QUnit.test( 'newFromFileName', 62, function ( assert ) { // 62 = 15 cases with nameText x 4 assertions + 2 null cases x 1
+               var title, i, thisCase, prefix,
+                       cases = [
+                               {
+                                       fileName: 'DCS0001557854455.JPG',
+                                       typeOfName: 'Standard camera output',
+                                       nameText: 'DCS0001557854455',
+                                       prefixedText: 'File:DCS0001557854455.JPG',
+                                       extensionDesired: 'jpg'
+                               },
+                               {
+                                       fileName: 'File:Sample.png',
+                                       typeOfName: 'Carrying namespace',
+                                       nameText: 'File-Sample',
+                                       prefixedText: 'File:File-Sample.png'
+                               },
+                               {
+                                       fileName: 'Treppe 2222 Test upload.jpg',
+                                       typeOfName: 'File name with spaces in it and lower case file extension',
+                                       nameText: 'Treppe 2222 Test upload',
+                                       prefixedText: 'File:Treppe 2222 Test upload.jpg',
+                                       extensionDesired: 'JPG'
+                               },
+                               {
+                                       fileName: 'I contain a \ttab.jpg',
+                                       typeOfName: 'Name containing a tab character',
+                                       nameText: 'I contain a tab',
+                                       prefixedText: 'File:I contain a tab.jpg'
+                               },
+                               {
+                                       fileName: 'I_contain multiple__ ___ _underscores.jpg',
+                                       typeOfName: 'Name containing multiple underscores',
+                                       nameText: 'I contain multiple underscores',
+                                       prefixedText: 'File:I contain multiple underscores.jpg'
+                               },
+                               {
+                                       fileName: 'I like ~~~~~~~~es.jpg',
+                                       typeOfName: 'Name containing more than three consecutive tilde characters',
+                                       nameText: 'I like ~~es',
+                                       prefixedText: 'File:I like ~~es.jpg'
+                               },
+                               {
+                                       fileName: 'BI\u200EDI.jpg',
+                                       typeOfName: 'Name containing BIDI overrides',
+                                       nameText: 'BIDI',
+                                       prefixedText: 'File:BIDI.jpg'
+                               },
+                               {
+                                       fileName: '100%ab progress.jpg',
+                                       typeOfName: 'File name with URL encoding',
+                                       nameText: '100% ab progress',
+                                       prefixedText: 'File:100% ab progress.jpg'
+                               },
+                               {
+                                       fileName: '<([>]):/#.jpg',
+                                       typeOfName: 'File name with characters not permitted in titles that are replaced',
+                                       nameText: '((()))---',
+                                       prefixedText: 'File:((()))---.jpg'
+                               },
+                               {
+                                       fileName: 'spaces\u0009\u2000\u200A\u200Bx.djvu',
+                                       typeOfName: 'File name with different kind of spaces',
+                                       nameText: 'Spaces \u200Bx',
+                                       prefixedText: 'File:Spaces \u200Bx.djvu'
+                               },
+                               {
+                                       fileName: 'dot.dot.dot.dot.dotdot',
+                                       typeOfName: 'File name with a lot of dots',
+                                       nameText: 'Dot.dot.dot.dot',
+                                       prefixedText: 'File:Dot.dot.dot.dot.dotdot'
+                               },
+                               {
+                                       fileName: 'dot. dot ._dot',
+                                       typeOfName: 'File name with multiple dots and spaces',
+                                       nameText: 'Dot. dot',
+                                       prefixedText: 'File:Dot. dot. dot'
+                               },
+                               {
+                                       fileName: 'dot. dot ._dot',
+                                       typeOfName: 'File name with different file extension desired',
+                                       nameText: 'Dot. dot . dot',
+                                       prefixedText: 'File:Dot. dot . dot.png',
+                                       extensionDesired: 'png'
+                               },
+                               {
+                                       fileName: 'fileWOExt',
+                                       typeOfName: 'File W/O extension with extension desired',
+                                       nameText: 'FileWOExt',
+                                       prefixedText: 'File:FileWOExt.png',
+                                       extensionDesired: 'png'
+                               },
+                               {
+                                       fileName: '𠜎𠜱𠝹𠱓𠱸𠲖𠳏𠳕𠴕𠵼𠵿𠸎𠸏𠹷𠺝𠺢𠻗𠻹𠻺𠼭𠼮𠽌𠾴𠾼𠿪𡁜𡁯𡁵𡁶𡁻𡃁𡃉𡇙𢃇𢞵𢫕𢭃𢯊𢱑𢱕𢳂𠻹𠻺𠼭𠼮𠽌𠾴𠾼𠿪𡁜𡁯𡁵𡁶𡁻𡃁𡃉𡇙𢃇𢞵𢫕𢭃𢯊𢱑𢱕𢳂.png',
+                                       typeOfName: 'File name longer than 240 bytes',
+                                       nameText: '𠜎𠜱𠝹𠱓𠱸𠲖𠳏𠳕𠴕𠵼𠵿𠸎𠸏𠹷𠺝𠺢𠻗𠻹𠻺𠼭𠼮𠽌𠾴𠾼𠿪𡁜𡁯𡁵𡁶𡁻𡃁𡃉𡇙𢃇𢞵𢫕𢭃𢯊𢱑𢱕𢳂𠻹𠻺𠼭𠼮𠽌𠾴𠾼𠿪𡁜𡁯𡁵𡁶𡁻𡃁𡃉𡇙𢃇𢞵',
+                                       prefixedText: 'File:𠜎𠜱𠝹𠱓𠱸𠲖𠳏𠳕𠴕𠵼𠵿𠸎𠸏𠹷𠺝𠺢𠻗𠻹𠻺𠼭𠼮𠽌𠾴𠾼𠿪𡁜𡁯𡁵𡁶𡁻𡃁𡃉𡇙𢃇𢞵𢫕𢭃𢯊𢱑𢱕𢳂𠻹𠻺𠼭𠼮𠽌𠾴𠾼𠿪𡁜𡁯𡁵𡁶𡁻𡃁𡃉𡇙𢃇𢞵.png'
+                               },
+                               {
+                                       fileName: '',
+                                       typeOfName: 'Empty string'
+                               },
+                               {
+                                       fileName: 'foo',
+                                       typeOfName: 'String with only alphabet characters'
+                               }
+                       ];
+
+               for ( i = 0; i < cases.length; i++ ) {
+                       thisCase = cases[i];
+                       title = mw.Title.newFromFileName( thisCase.fileName, thisCase.extensionDesired );
+
+                       if ( thisCase.nameText !== undefined ) { // case expects a Title: four assertions
+                               prefix = '[' + thisCase.typeOfName + '] ';
+
+                               assert.notStrictEqual( title, null, prefix + 'Parses successfully' );
+                               assert.equal( title.getNameText(), thisCase.nameText, prefix + 'Filename matches original' );
+                               assert.equal( title.getPrefixedText(), thisCase.prefixedText, prefix + 'File page title matches original' );
+                               assert.equal( title.getNamespaceId(), 6, prefix + 'Namespace ID matches File namespace' );
+                       } else { // cases without nameText (no usable extension) expect null: one assertion
+                               assert.strictEqual( title, null, thisCase.typeOfName + ', should not produce an mw.Title object' );
+                       }
+               }
+       } );
+
 }( mediaWiki, jQuery ) );