EditPage: Don't allow clients that mangle unicode to edit

author Max Semenik <maxsem.wiki@gmail.com>

Mon, 28 Aug 2017 21:14:44 +0000 (14:14 -0700)

committer Kunal Mehta <legoktm@member.fsf.org>

Thu, 21 Sep 2017 02:57:51 +0000 (19:57 -0700)
author Max Semenik <maxsem.wiki@gmail.com>
Mon, 28 Aug 2017 21:14:44 +0000 (14:14 -0700)
committer Kunal Mehta <legoktm@member.fsf.org>
Thu, 21 Sep 2017 02:57:51 +0000 (19:57 -0700)
diff --git a/RELEASE-NOTES-1.30 b/RELEASE-NOTES-1.30

index 8517a8f..513dc3a 100644 (file)
--- a/RELEASE-NOTES-1.30
+++ b/RELEASE-NOTES-1.30
@@ -73,6 +73,7 @@ section).
  * (T138166) Added ability for users to prohibit other users from sending them
    emails with Special:Emailuser. Can be enabled by setting
    $wgEnableUserEmailBlacklist to true.
+* $wgBrowserBlacklist is deprecated, and changing it will have no effect.
  
  === External library changes in 1.30 ===
  
@@ -221,6 +222,7 @@ changes to languages because of Phabricator reports.
  * wfShellExec() and related functions are deprecated, use Shell::command().
  * (T138166) SpecialEmailUser::getTarget() now requires a second argument, the sending
    user object. Using the method without the second argument is deprecated.
+* (T67297) Browsers that don't support Unicode will have their edits rejected.
  
  == Compatibility ==
  MediaWiki 1.30 requires PHP 5.5.9 or later. There is experimental support for
diff --git a/includes/DefaultSettings.php b/includes/DefaultSettings.php

index cdb7ce8..5630dcb 100644 (file)
--- a/includes/DefaultSettings.php
+++ b/includes/DefaultSettings.php
@@ -2980,46 +2980,9 @@ $wgAllUnicodeFixes = false;
  $wgLegacyEncoding = false;
  
  /**
- * Browser Blacklist for unicode non compliant browsers. Contains a list of
- * regexps : "/regexp/"  matching problematic browsers. These browsers will
- * be served encoded unicode in the edit box instead of real unicode.
+ * @deprecated since 1.30, does nothing
   */
-$wgBrowserBlackList = [
-       /**
-        * Netscape 2-4 detection
-        * The minor version may contain strings such as "Gold" or "SGoldC-SGI"
-        * Lots of non-netscape user agents have "compatible", so it's useful to check for that
-        * with a negative assertion. The [UIN] identifier specifies the level of security
-        * in a Netscape/Mozilla browser, checking for it rules out a number of fakers.
-        * The language string is unreliable, it is missing on NS4 Mac.
-        *
-        * Reference: http://www.psychedelix.com/agents/index.shtml
-        */
-       '/^Mozilla\/2\.[^ ]+ [^(]*?\((?!compatible).*; [UIN]/',
-       '/^Mozilla\/3\.[^ ]+ [^(]*?\((?!compatible).*; [UIN]/',
-       '/^Mozilla\/4\.[^ ]+ [^(]*?\((?!compatible).*; [UIN]/',
-
-       /**
-        * MSIE on Mac OS 9 is teh sux0r, converts þ to <thorn>, ð to <eth>,
-        * Þ to <THORN> and Ð to <ETH>
-        *
-        * Known useragents:
-        * - Mozilla/4.0 (compatible; MSIE 5.0; Mac_PowerPC)
-        * - Mozilla/4.0 (compatible; MSIE 5.15; Mac_PowerPC)
-        * - Mozilla/4.0 (compatible; MSIE 5.23; Mac_PowerPC)
-        * - [...]
-        *
-        * @link https://en.wikipedia.org/w/index.php?diff=12356041&oldid=12355864
-        * @link https://en.wikipedia.org/wiki/Template%3AOS9
-        */
-       '/^Mozilla\/4\.0 \(compatible; MSIE \d+\.\d+; Mac_PowerPC\)/',
-
-       /**
-        * Google wireless transcoder, seems to eat a lot of chars alive
-        * https://it.wikipedia.org/w/index.php?title=Luciano_Ligabue&diff=prev&oldid=8857361
-        */
-       '/^Mozilla\/4\.0 \(compatible; MSIE 6.0; Windows NT 5.0; Google Wireless Transcoder;\)/'
-];
+$wgBrowserBlackList = [];
  
  /**
   * If set to true, the MediaWiki 1.4 to 1.5 schema conversion will
diff --git a/includes/EditPage.php b/includes/EditPage.php

index 0ea61c0..b40a054 100644 (file)
--- a/includes/EditPage.php
+++ b/includes/EditPage.php
@@ -40,6 +40,11 @@ use Wikimedia\ScopedCallback;
   * headaches, which may be fatal.
   */
  class EditPage {
+       /**
+        * Used for Unicode support checks
+        */
+       const UNICODE_CHECK = 'ℳ𝒲♥𝓊𝓃𝒾𝒸ℴ𝒹ℯ';
+
         /**
          * Status: Article successfully updated
          */
@@ -177,6 +182,11 @@ class EditPage {
          */
         const AS_CANNOT_USE_CUSTOM_MODEL = 241;
  
+       /**
+        * Status: edit rejected because browser doesn't support Unicode.
+        */
+       const AS_UNICODE_NOT_SUPPORTED = 242;
+
         /**
          * HTML id and name for the beginning of the edit form.
          */
@@ -413,6 +423,11 @@ class EditPage {
          */
         private $isOldRev = false;
  
+       /**
+        * @var string|null What the user submitted in the 'wpUnicodeCheck' field
+        */
+       private $unicodeCheck;
+
         /**
          * @param Article $article
          */
@@ -865,7 +880,7 @@ class EditPage {
                         # These fields need to be checked for encoding.
                         # Also remove trailing whitespace, but don't remove _initial_
                         # whitespace from the text boxes. This may be significant formatting.
-                       $this->textbox1 = $this->safeUnicodeInput( $request, 'wpTextbox1' );
+                       $this->textbox1 = rtrim( $request->getText( 'wpTextbox1' ) );
                         if ( !$request->getCheck( 'wpTextbox2' ) ) {
                                 // Skip this if wpTextbox2 has input, it indicates that we came
                                 // from a conflict page with raw page text, not a custom form
@@ -876,6 +891,8 @@ class EditPage {
                                 }
                         }
  
+                       $this->unicodeCheck = $request->getText( 'wpUnicodeCheck' );
+
                         $this->summary = $request->getText( 'wpSummary' );
  
                         # If the summary consists of a heading, e.g. '==Foobar==', extract the title from the
@@ -1544,6 +1561,7 @@ class EditPage {
  
                         case self::AS_CANNOT_USE_CUSTOM_MODEL:
                         case self::AS_PARSE_ERROR:
+                       case self::AS_UNICODE_NOT_SUPPORTED:
                                 $out->addWikiText( '<div class="error">' . "\n" . $status->getWikiText() . '</div>' );
                                 return true;
  
@@ -1745,6 +1763,12 @@ class EditPage {
                         return $status;
                 }
  
+               if ( $this->unicodeCheck !== self::UNICODE_CHECK ) {
+                       $status->fatal( 'unicode-support-fail' );
+                       $status->value = self::AS_UNICODE_NOT_SUPPORTED;
+                       return $status;
+               }
+
                 $request = $this->context->getRequest();
                 $spam = $request->getText( 'wpAntispam' );
                 if ( $spam !== '' ) {
@@ -2673,6 +2697,9 @@ class EditPage {
                         call_user_func_array( $formCallback, [ &$out ] );
                 }
  
+               // Add a check for Unicode support
+               $out->addHTML( Html::hidden( 'wpUnicodeCheck', self::UNICODE_CHECK ) );
+
                 // Add an empty field to trip up spambots
                 $out->addHTML(
                         Xml::openElement( 'div', [ 'id' => 'antispam-container', 'style' => 'display: none;' ] )
@@ -2947,10 +2974,6 @@ class EditPage {
                                 $out->addWikiText( $this->hookError );
                         }
  
-                       if ( !$this->checkUnicodeCompliantBrowser() ) {
-                               $out->addWikiMsg( 'nonunicodebrowser' );
-                       }
-
                         if ( $this->section != 'new' ) {
                                 $revision = $this->mArticle->getRevisionFetched();
                                 if ( $revision ) {
@@ -3221,10 +3244,6 @@ class EditPage {
                 $out->addHTML( Html::hidden( 'wpEdittime', $this->edittime ) );
                 $out->addHTML( Html::hidden( 'editRevId', $this->editRevId ) );
                 $out->addHTML( Html::hidden( 'wpScrolltop', $this->scrolltop, [ 'id' => 'wpScrolltop' ] ) );
-
-               if ( !$this->checkUnicodeCompliantBrowser() ) {
-                       $out->addHTML( Html::hidden( 'safemode', '1' ) );
-               }
         }
  
         protected function showFormAfterText() {
@@ -3318,8 +3337,7 @@ class EditPage {
         }
  
         protected function showTextbox( $text, $name, $customAttribs = [] ) {
-               $wikitext = $this->safeUnicodeOutput( $text );
-               $wikitext = $this->addNewLineAtEnd( $wikitext );
+               $wikitext = $this->addNewLineAtEnd( $text );
  
                 $attribs = $this->buildTextboxAttribs( $name, $customAttribs, $this->context->getUser() );
  
@@ -4461,138 +4479,31 @@ class EditPage {
                 $out->addReturnTo( $this->getContextTitle(), [ 'action' => 'edit' ] );
         }
  
-       /**
-        * Check if the browser is on a blacklist of user-agents known to
-        * mangle UTF-8 data on form submission. Returns true if Unicode
-        * should make it through, false if it's known to be a problem.
-        * @return bool
-        */
-       private function checkUnicodeCompliantBrowser() {
-               global $wgBrowserBlackList;
-
-               $currentbrowser = $this->context->getRequest()->getHeader( 'User-Agent' );
-               if ( $currentbrowser === false ) {
-                       // No User-Agent header sent? Trust it by default...
-                       return true;
-               }
-
-               foreach ( $wgBrowserBlackList as $browser ) {
-                       if ( preg_match( $browser, $currentbrowser ) ) {
-                               return false;
-                       }
-               }
-               return true;
-       }
-
         /**
          * Filter an input field through a Unicode de-armoring process if it
          * came from an old browser with known broken Unicode editing issues.
          *
+        * @deprecated since 1.30, does nothing
+        *
          * @param WebRequest $request
          * @param string $field
          * @return string
          */
         protected function safeUnicodeInput( $request, $field ) {
-               $text = rtrim( $request->getText( $field ) );
-               return $request->getBool( 'safemode' )
-                       ? $this->unmakeSafe( $text )
-                       : $text;
+               return rtrim( $request->getText( $field ) );
         }
  
         /**
          * Filter an output field through a Unicode armoring process if it is
          * going to an old browser with known broken Unicode editing issues.
          *
+        * @deprecated since 1.30, does nothing
+        *
          * @param string $text
          * @return string
          */
         protected function safeUnicodeOutput( $text ) {
-               return $this->checkUnicodeCompliantBrowser()
-                       ? $text
-                       : $this->makeSafe( $text );
-       }
-
-       /**
-        * A number of web browsers are known to corrupt non-ASCII characters
-        * in a UTF-8 text editing environment. To protect against this,
-        * detected browsers will be served an armored version of the text,
-        * with non-ASCII chars converted to numeric HTML character references.
-        *
-        * Preexisting such character references will have a 0 added to them
-        * to ensure that round-trips do not alter the original data.
-        *
-        * @param string $invalue
-        * @return string
-        */
-       private function makeSafe( $invalue ) {
-               // Armor existing references for reversibility.
-               $invalue = strtr( $invalue, [ "&#x" => "&#x0" ] );
-
-               $bytesleft = 0;
-               $result = "";
-               $working = 0;
-               $valueLength = strlen( $invalue );
-               for ( $i = 0; $i < $valueLength; $i++ ) {
-                       $bytevalue = ord( $invalue[$i] );
-                       if ( $bytevalue <= 0x7F ) { // 0xxx xxxx
-                               $result .= chr( $bytevalue );
-                               $bytesleft = 0;
-                       } elseif ( $bytevalue <= 0xBF ) { // 10xx xxxx
-                               $working = $working << 6;
-                               $working += ( $bytevalue & 0x3F );
-                               $bytesleft--;
-                               if ( $bytesleft <= 0 ) {
-                                       $result .= "&#x" . strtoupper( dechex( $working ) ) . ";";
-                               }
-                       } elseif ( $bytevalue <= 0xDF ) { // 110x xxxx
-                               $working = $bytevalue & 0x1F;
-                               $bytesleft = 1;
-                       } elseif ( $bytevalue <= 0xEF ) { // 1110 xxxx
-                               $working = $bytevalue & 0x0F;
-                               $bytesleft = 2;
-                       } else { // 1111 0xxx
-                               $working = $bytevalue & 0x07;
-                               $bytesleft = 3;
-                       }
-               }
-               return $result;
-       }
-
-       /**
-        * Reverse the previously applied transliteration of non-ASCII characters
-        * back to UTF-8. Used to protect data from corruption by broken web browsers
-        * as listed in $wgBrowserBlackList.
-        *
-        * @param string $invalue
-        * @return string
-        */
-       private function unmakeSafe( $invalue ) {
-               $result = "";
-               $valueLength = strlen( $invalue );
-               for ( $i = 0; $i < $valueLength; $i++ ) {
-                       if ( ( substr( $invalue, $i, 3 ) == "&#x" ) && ( $invalue[$i + 3] != '0' ) ) {
-                               $i += 3;
-                               $hexstring = "";
-                               do {
-                                       $hexstring .= $invalue[$i];
-                                       $i++;
-                               } while ( ctype_xdigit( $invalue[$i] ) && ( $i < strlen( $invalue ) ) );
-
-                               // Do some sanity checks. These aren't needed for reversibility,
-                               // but should help keep the breakage down if the editor
-                               // breaks one of the entities whilst editing.
-                               if ( ( substr( $invalue, $i, 1 ) == ";" ) && ( strlen( $hexstring ) <= 6 ) ) {
-                                       $codepoint = hexdec( $hexstring );
-                                       $result .= UtfNormal\Utils::codepointToUtf8( $codepoint );
-                               } else {
-                                       $result .= "&#x" . $hexstring . substr( $invalue, $i, 1 );
-                               }
-                       } else {
-                               $result .= substr( $invalue, $i, 1 );
-                       }
-               }
-               // reverse the transform that we made for reversibility reasons.
-               return strtr( $result, [ "&#x0" => "&#x" ] );
+               return $text;
         }
  
         /**
diff --git a/includes/api/ApiEditPage.php b/includes/api/ApiEditPage.php

index 4360b4d..94d6e97 100644 (file)
--- a/includes/api/ApiEditPage.php
+++ b/includes/api/ApiEditPage.php
@@ -266,6 +266,7 @@ class ApiEditPage extends ApiBase {
                         'wpIgnoreBlankArticle' => true,
                         'wpIgnoreSelfRedirect' => true,
                         'bot' => $params['bot'],
+                       'wpUnicodeCheck' => EditPage::UNICODE_CHECK,
                 ];
  
                 if ( !is_null( $params['summary'] ) ) {
diff --git a/languages/i18n/en.json b/languages/i18n/en.json

index 47e6205..6d06e40 100644 (file)
--- a/languages/i18n/en.json
+++ b/languages/i18n/en.json
@@ -704,8 +704,8 @@
         "explainconflict": "Someone else has changed this page since you started editing it.\nThe upper text area contains the page text as it currently exists.\nYour changes are shown in the lower text area.\nYou will have to merge your changes into the existing text.\n<strong>Only</strong> the text in the upper text area will be saved when you press \"$1\".",
         "yourtext": "Your text",
         "storedversion": "Stored revision",
-       "nonunicodebrowser": "<strong>Warning: Your browser is not Unicode compliant.</strong>\nA workaround is in place to allow you to safely edit pages: Non-ASCII characters will appear in the edit box as hexadecimal codes.",
         "editingold": "<strong>Warning: You are editing an out-of-date revision of this page.</strong>\nIf you save it, any changes made since this revision will be lost.",
+       "unicode-support-fail": "It appears that your browser does not support Unicode. It is required to edit pages, so your edit was not saved.",
         "yourdiff": "Differences",
         "copyrightwarning": "Please note that all contributions to {{SITENAME}} are considered to be released under the $2 (see $1 for details).\nIf you do not want your writing to be edited mercilessly and redistributed at will, then do not submit it here.<br />\nYou are also promising us that you wrote this yourself, or copied it from a public domain or similar free resource.\n<strong>Do not submit copyrighted work without permission!</strong>",
         "copyrightwarning2": "Please note that all contributions to {{SITENAME}} may be edited, altered, or removed by other contributors.\nIf you do not want your writing to be edited mercilessly, then do not submit it here.<br />\nYou are also promising us that you wrote this yourself, or copied it from a public domain or similar free resource (see $1 for details).\n<strong>Do not submit copyrighted work without permission!</strong>",
diff --git a/languages/i18n/qqq.json b/languages/i18n/qqq.json

index 0a6e91b..7534322 100644 (file)
--- a/languages/i18n/qqq.json
+++ b/languages/i18n/qqq.json
@@ -896,8 +896,8 @@
         "explainconflict": "Appears at the top of a page when there is an edit conflict.\n\nParameters:\n* $1 – The label of the save button – one of {{msg-mw|savearticle}} or {{msg-mw|savechanges}} on save-labelled wiki, or {{msg-mw|publishpage}} or {{msg-mw|publishchanges}} on publish-labelled wikis.\n\nSee also:\n* {{msg-mw|Savearticle}}",
         "yourtext": "Used in Diff Preview page. The diff is between {{msg-mw|currentrev}} and {{msg-mw|yourtext}}.\n\nAlso used in Edit Conflict page; the diff between {{msg-mw|yourtext}} and {{msg-mw|storedversion}}.",
         "storedversion": "This is used in an edit conflict as the label for the top revision that has been stored, as opposed to your version {{msg-mw|yourtext}} that has not been stored which is shown at the bottom of the page.",
-       "nonunicodebrowser": "Used as warning when editing page.",
         "editingold": "Used as warning when editing an old revision of a page.",
+       "unicode-support-fail": "Error message shown to users if their browser doesn't support Unicode",
         "yourdiff": "Used as h2 header for the diff of the current version of a page with the user's version in case there is an edit conflict or a spam filter hit.",
         "copyrightwarning": "Copyright warning displayed under the edit box in editor. Parameters:\n* $1 - link\n* $2 - license name",
         "copyrightwarning2": "Copyright warning displayed under the edit box in editor\n*$1 - license name",
diff --git a/tests/phpunit/includes/EditPageTest.php b/tests/phpunit/includes/EditPageTest.php

index 9507811..ad0d07a 100644 (file)
--- a/tests/phpunit/includes/EditPageTest.php
+++ b/tests/phpunit/includes/EditPageTest.php
@@ -165,6 +165,10 @@ class EditPageTest extends MediaWikiLangTestCase {
                         $edit['wpStarttime'] = wfTimestampNow();
                 }
  
+               if ( !isset( $edit['wpUnicodeCheck'] ) ) {
+                       $edit['wpUnicodeCheck'] = EditPage::UNICODE_CHECK;
+               }
+
                 $req = new FauxRequest( $edit, true ); // session ??
  
                 $article = new Article( $title );
@@ -697,7 +701,8 @@ hello
                         'wpTextbox1' => serialize( 'non-text content' ),
                         'wpEditToken' => $user->getEditToken(),
                         'wpEdittime' => '',
-                       'wpStarttime' => wfTimestampNow()
+                       'wpStarttime' => wfTimestampNow(),
+                       'wpUnicodeCheck' => EditPage::UNICODE_CHECK,
                 ];
  
                 $req = new FauxRequest( $edit, true );
author	Max Semenik <maxsem.wiki@gmail.com>
	Mon, 28 Aug 2017 21:14:44 +0000 (14:14 -0700)
committer	Kunal Mehta <legoktm@member.fsf.org>
	Thu, 21 Sep 2017 02:57:51 +0000 (19:57 -0700)
RELEASE-NOTES-1.30		patch \| blob \| history
includes/DefaultSettings.php		patch \| blob \| history
includes/EditPage.php		patch \| blob \| history
includes/api/ApiEditPage.php		patch \| blob \| history
languages/i18n/en.json		patch \| blob \| history
languages/i18n/qqq.json		patch \| blob \| history
tests/phpunit/includes/EditPageTest.php		patch \| blob \| history