Far from finished in-place UTF-8 wiki converter.
[lhc/web/wiklou.git] / maintenance / convertUtf8.php
1 <?php
2
# Safety guard: this converter is unfinished, so stop before any table is touched.
exit( "This file is not complete; it's checked in so I don't forget it." );
4
5 /*
6 UTF-8 conversion of DOOOOOOOM
7
8 1. Lock the wiki
9 2. Make a convertlist of all pages
10 3. Enable CONVERTLOCK mode and switch to UTF-8
11 4. As quickly as possible, convert the cur, images, *links, user, etc. tables. Clear cache tables.
12 5. Unlock the wiki. Attempts to access pages on the convertlist will be trapped to read-only.
13 6. Go through the list, fixing up old revisions. Remove pages from the convertlist.
14 */
15
16
# toUtf8( $string ): convert a legacy 8-bit string to UTF-8 and return it
# passed through wfStrencode(), i.e. ready to be embedded in the quoted SQL
# literals built further down in this script.
if( function_exists( "iconv" ) ) {
	# There are likely to be Windows code page 1252 chars in there.
	# Convert them to the proper UTF-8 chars if possible.
	function toUtf8( $string ) {
		$converted = iconv( "CP1252", "UTF-8", $string );
		if( $converted === false ) {
			# iconv() reports failure (false) when it meets a byte it cannot
			# map; CP1252 leaves a handful of byte values undefined. Passing
			# that false on would store an empty value and destroy the field,
			# so fall back to the same ISO 8859-1 conversion that the
			# non-iconv branch below uses.
			$converted = utf8_encode( $string );
		}
		return wfStrencode( $converted );
	}
} else {
	# Will work from plain iso 8859-1 and may corrupt these chars
	function toUtf8( $string ) {
		return wfStrencode( utf8_encode( $string ) );
	}
}
29
30
31
# user table
# Re-encode the textual columns of every account as UTF-8. A row is only
# rewritten (and its memcached entry dropped) when at least one of the
# converted values differs from the stored one.
$sql = "SELECT user_id,user_name,user_real_name,user_options FROM user";
$res = wfQuery( $sql, DB_WRITE );
print "Converting " . wfNumResults( $res ) . " user accounts:\n";
$n = 0; # rows examined
$u = 0; # rows that needed rewriting; must start at zero for the statistic below
while( $s = wfFetchObject( $res ) ) {
	$uname = toUtf8( $s->user_name );
	$ureal = toUtf8( $s->user_real_name );
	$uoptions = toUtf8( $s->user_options );
	# toUtf8() returns escaped strings, so compare against the escaped originals.
	if( $uname !== wfStrencode( $s->user_name ) ||
		$ureal !== wfStrencode( $s->user_real_name ) ||
		$uoptions !== wfStrencode( $s->user_options ) ) {
		$now = wfTimestampNow();
		$sql = "UPDATE user
			SET user_name='$uname',user_real_name='$ureal',
			user_options='$uoptions',user_touched='$now'
			WHERE user_id={$s->user_id}";
		wfQuery( $sql, DB_WRITE );
		# Drop the cached copy of this account so the old encoding is not served.
		$wgMemc->delete( "$wgDBname:user:id:{$s->user_id}" );
		$u++;
	}
	# Progress marker every 100 rows.
	if( ++$n % 100 == 0 ) print "$n\n";
}
wfFreeResult( $res );
if( $n ) {
	# Scale the ratio to a percentage; "%%" prints a literal percent sign.
	printf( "%.2f%% required conversion.\n\n", 100 * $u / $n );
} else {
	print "None?\n\n";
}
61
# ipblocks
# Block reasons are free text and often repeated, so each distinct reason is
# converted once and every row carrying it is updated in a single statement.
$sql = "SELECT DISTINCT ipb_reason FROM ipblocks";
$res = wfQuery( $sql, DB_WRITE );
print "Converting " . wfNumResults( $res ) . " IP block comments:\n";
$n = 0; # distinct reasons examined
$u = 0; # distinct reasons that needed rewriting (reset: counted per table)
while( $s = wfFetchObject( $res ) ) {
	$ucomment = toUtf8( $s->ipb_reason );
	$ocomment = wfStrencode( $s->ipb_reason );
	# Only touch the table when the conversion actually changed the text.
	if( $ucomment !== $ocomment ) {
		$sql = "UPDATE ipblocks SET ipb_reason='$ucomment' WHERE ipb_reason='$ocomment'";
		wfQuery( $sql, DB_WRITE );
		$u++;
	}
	# Progress marker every 100 rows.
	if( ++$n % 100 == 0 ) print "$n\n";
}
wfFreeResult( $res );
if( $n ) {
	# Scale the ratio to a percentage; "%%" prints a literal percent sign.
	printf( "%.2f%% required conversion.\n\n", 100 * $u / $n );
} else {
	print "None?\n\n";
}
83
# image
# Only the query string exists so far; nothing below this point runs it.
$sql = "SELECT img_name,img_description,img_user_text FROM image";
/*
 TODO: conversion of the following tables/columns is not written yet.
 (Kept inside a comment so that the file still parses.)

 image
   img_name        --> also need to rename files
   img_description
   img_user_text

 oldimage
   oi_name
   oi_archive_name --> also need to rename files
   oi_user_text

 recentchanges
   rc_user_text
   rc_title
   rc_comment
*/
99
# searchindex, linkscc, querycache, objectcache
# None of these tables is converted: each one is simply emptied. Every entry
# pairs the progress notice with the statement to run.
# querycache: just rebuild these
$cacheResets = array(
	array( "Clearing searchindex... don't forget to rebuild it.\n", "DELETE FROM searchindex" ),
	array( "Clearing linkscc...\n", "DELETE FROM linkscc" ),
	array( "Clearing querycache...\n", "DELETE FROM querycache" ),
	array( "Clearing objectcache...\n", "DELETE FROM objectcache" ),
);
foreach( $cacheResets as $reset ) {
	list( $notice, $sql ) = $reset;
	print $notice;
	wfQuery( $sql, DB_WRITE );
}
119
120
# Re-encode one text column of a links table as UTF-8.
#
# $table - name of the table to update
# $field - name of the column holding the link target
#
# Selects each distinct value of $field that contains a byte in the range
# 0x80-0xff and rewrites all rows holding that value with its toUtf8() form.
# Prints a progress marker every 100 values; returns nothing.
function unicodeLinks( $table, $field ) {
	$res = wfQuery(
		"SELECT DISTINCT $field FROM $table WHERE $field RLIKE '[\x80-\xff]'",
		DB_WRITE
	);
	print "Converting " . wfNumResults( $res ) . " from $table:\n";

	# $done counts the row currently being handled, starting at one.
	for( $done = 1; $row = wfFetchObject( $res ); $done++ ) {
		$converted = toUtf8( $row->$field );
		$original = wfStrencode( $row->$field );
		wfQuery(
			"UPDATE $table SET $field='$converted' WHERE $field='$original'",
			DB_WRITE
		);
		if( $done % 100 == 0 ) {
			print "$done\n";
		}
	}

	wfFreeResult( $res );
	print "Done.\n\n";
}
# Link tables whose target column stores page titles as text, in processing order.
$linkTargets = array(
	"brokenlinks" => "bl_to",
	"imagelinks" => "il_to",
	"categorylinks" => "cl_to",
);
foreach( $linkTargets as $linkTable => $targetField ) {
	unicodeLinks( $linkTable, $targetField );
}
139
140
# The big guys...
# Convert every cur row that has a high-bit byte in its title, comment, user
# text or page text, and carry changed titles over to the old and watchlist
# tables so they keep pointing at the same page.
#
# cur_comment must be part of the SELECT list: it is converted and written
# back below, and without it the UPDATE would overwrite the stored edit
# summary with the conversion of a missing value.
$sql = "SELECT cur_id,cur_namespace,cur_title,cur_text,cur_comment,cur_user_text FROM cur
	WHERE cur_title rlike '[\x80-\xff]' OR cur_comment rlike '[\x80-\xff]'
	OR cur_user_text rlike '[\x80-\xff]' OR cur_text rlike '[\x80-\xff]'";
$res = wfQuery( $sql, DB_WRITE );
print "Converting " . wfNumResults( $res ) . " cur pages:\n";
$n = 0; # rows converted
$u = 0; # rows whose title changed (reset: counted per table)
while( $s = wfFetchObject( $res ) ) {
	$utitle = toUtf8( $s->cur_title );
	$uuser = toUtf8( $s->cur_user_text );
	$ucomment = toUtf8( $s->cur_comment );
	$utext = toUtf8( $s->cur_text );

	$sql = "UPDATE cur
		SET cur_title='$utitle',cur_user_text='$uuser',
		cur_comment='$ucomment',cur_text='$utext'
		WHERE cur_id={$s->cur_id}";
	wfQuery( $sql, DB_WRITE );
	# NOTE(review): no memcached entry is purged for converted pages here.

	$otitle = wfStrencode( $s->cur_title );
	if( $otitle !== $utitle ) {
		# Also update titles in watchlist and old
		$sql = "UPDATE old SET old_title='$utitle'
			WHERE old_namespace={$s->cur_namespace} AND old_title='$otitle'";
		wfQuery( $sql, DB_WRITE );

		# Clear the lowest bit of the namespace number before matching
		# watchlist rows (presumably maps a talk namespace onto its subject
		# namespace -- confirm against the watchlist schema).
		$ns = intval( $s->cur_namespace ) & ~1;
		$sql = "UPDATE watchlist SET wl_title='$utitle'
			WHERE wl_namespace=$ns AND wl_title='$otitle'";
		wfQuery( $sql, DB_WRITE );
		$u++;
	}

	# Progress marker every 100 rows.
	if( ++$n % 100 == 0 ) print "$n\n";
}
wfFreeResult( $res );
if( $n ) {
	# Scale the ratio to a percentage; "%%" prints a literal percent sign.
	printf( "Updated old/watchlist titles on %.2f%%.\n\n", 100 * $u / $n );
} else {
	print "Didn't update any old/watchlist titles.\n\n";
}
184
185 /*
186 old
187 old_title
188 old_text -> may be gzipped
189 old_comment
190 old_user_text
191
192 archive
193 ar_title
194 ar_text -> may be gzipped
195 ar_comment
196 ar_user_text
197 */
198
199 ?>