From 03e42a53d6daf8a4c42e119688d90caf4aff513f Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Fri, 8 Oct 2004 08:46:25 +0000 Subject: [PATCH] old_text conversion on load, for future UTF-8 conversion of en.wikipedia.org and remaining holdouts. * When not $wgUseLatin1, saved revisions are now marked with 'utf-8' in old_flags * When $wgLegacyEncoding is set, an old row without 'utf-8' in old_flags is converted from $wgLegacyEncoding to $wgInputEncoding at load time (after gzip decompression if applicable). Thus the old_text fields will not need to be modified when the wiki is converted; this will mainly be useful after the SCHEMA_WORK changes are made so that the other fields in the old table are discarded (they will need to be separately converted anyway) --- includes/Article.php | 57 ++++++++++++++++++++++++------------ includes/DefaultSettings.php | 11 +++++++ 2 files changed, 50 insertions(+), 18 deletions(-) diff --git a/includes/Article.php b/includes/Article.php index 8e724caf0e..5fbeb2ab9e 100644 --- a/includes/Article.php +++ b/includes/Article.php @@ -73,47 +73,68 @@ class Article { $textField = $prefix . 'text'; $flagsField = $prefix . 'flags'; - if ( isset( $row->$flagsField ) ) { + if( isset( $row->$flagsField ) ) { $flags = explode( ',', $row->$flagsField ); } else { $flags = array(); } - if ( isset( $row->$textField ) ) { + if( isset( $row->$textField ) ) { $text = $row->$textField; } else { return false; } - if ( in_array( 'link', $flags ) ) { - # Handle link type - $text = Article::followLink( $text ); - } elseif ( in_array( 'gzip', $flags ) ) { + if( in_array( 'gzip', $flags ) ) { # Deal with optional compression of archived pages. # This can be done periodically via maintenance/compressOld.php, and # as pages are saved if $wgCompressRevisions is set. 
- return gzinflate( $text ); + $text = gzinflate( $text ); + } + + global $wgLegacyEncoding; + if( $wgLegacyEncoding && !in_array( 'utf-8', $flags ) ) { + # Old revisions kept around in a legacy encoding? + # Upconvert on demand. + global $wgInputEncoding, $wgContLang; + $text = $wgContLang->iconv( $wgLegacyEncoding, $wgInputEncoding, $text ); + } + + if( in_array( 'link', $flags ) ) { + # Handle link type + $text = Article::followLink( $text ); } return $text; } /** - * If $wgCompressRevisions is enabled, we will compress datas + * If $wgCompressRevisions is enabled, we will compress data. + * The input string is modified in place. + * Return value is the flags field: contains 'gzip' if the + * data is compressed, and 'utf-8' if we're saving in UTF-8 + * mode. + * * @static * @param mixed $text reference to a text - * @return string 'gzip' if it get compressed, '' overwise + * @return string */ function compressRevisionText( &$text ) { - global $wgCompressRevisions; - if( !$wgCompressRevisions ) { - return ''; - } - if( !function_exists( 'gzdeflate' ) ) { - wfDebug( "Article::compressRevisionText() -- no zlib support, not compressing\n" ); - return ''; + global $wgCompressRevisions, $wgUseLatin1; + $flags = array(); + if( !$wgUseLatin1 ) { + # Revisions not marked this way will be converted + # on load if $wgLegacyEncoding is set in the future. 
+ $flags[] = 'utf-8'; + } + if( $wgCompressRevisions ) { + if( function_exists( 'gzdeflate' ) ) { + $text = gzdeflate( $text ); + $flags[] = 'gzip'; + } else { + wfDebug( "Article::compressRevisionText() -- no zlib support, not compressing\n" ); + } } - $text = gzdeflate( $text ); - return 'gzip'; + return implode( ',', $flags ); } /** diff --git a/includes/DefaultSettings.php b/includes/DefaultSettings.php index 6f0fc3c075..a7f319a283 100644 --- a/includes/DefaultSettings.php +++ b/includes/DefaultSettings.php @@ -285,6 +285,17 @@ $wgInputEncoding = 'ISO-8859-1'; # LanguageUtf8.php normally overrides this $wgOutputEncoding = 'ISO-8859-1'; # unless you set the next option to true: $wgUseLatin1 = false; # Enable ISO-8859-1 compatibility mode $wgEditEncoding = ''; + +# Set this to eg 'ISO-8859-1' to perform character set +# conversion when loading old revisions not marked with +# "utf-8" flag. Use this when converting wiki to UTF-8 +# without the burdensome mass conversion of old text data. +# +# NOTE! This DOES NOT touch any fields other than old_text. +# Titles, comments, user names, etc still must be converted +# en masse in the database before continuing as a UTF-8 wiki. +$wgLegacyEncoding = false; + $wgMimeType = 'text/html'; $wgDocType = '-//W3C//DTD XHTML 1.0 Transitional//EN'; $wgDTD = 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'; -- 2.20.1