Updated deleteLinksFromNonexistent function:
[lhc/web/wiklou.git] / maintenance / refreshLinks.inc
1 <?php
2 /**
3 * @todo document
4 * @file
5 * @ingroup Maintenance
6 */
7
/**
 * Refreshes the link tables, or the redirect table, for a range of pages.
 *
 * Exactly one of three modes runs, checked in this order:
 *  - $oldRedirectsOnly: pages flagged page_is_redirect=1 that have no row in
 *    the redirect table are passed to fixRedirect().
 *  - $newOnly: pages with page_is_new=1 and page_id >= $start are processed;
 *    $end is not applied in this mode.
 *  - otherwise: every page_id from $start to $end is processed, whether or
 *    not a page with that ID exists.
 * In the last two modes $redirectsOnly selects fixRedirect() instead of
 * fixLinksFromArticle() for each page.
 *
 * Progress is printed to standard output and wfWaitForSlaves( $maxLag ) is
 * called every 100 pages.
 *
 * @param $start            First page_id to process (coerced to integer)
 * @param $newOnly          Only process pages with page_is_new=1
 * @param $maxLag           Value handed to wfWaitForSlaves()
 * @param $end              Last page_id to process (coerced to integer);
 *                          0 means no upper bound for old redirects and
 *                          max(page_id) for the full run
 * @param $redirectsOnly    Fix redirect entries instead of link tables
 * @param $oldRedirectsOnly Only fix redirects missing from the redirect table
 */
function refreshLinks( $start, $newOnly = false, $maxLag = false, $end = 0, $redirectsOnly = false, $oldRedirectsOnly = false ) {
	global $wgUser, $wgParser, $wgUseTidy;

	$reportingInterval = 100;
	$fname = 'refreshLinks';
	$dbr = wfGetDB( DB_SLAVE );
	$start = intval( $start );
	# $end is interpolated into raw SQL below, so coerce it exactly like $start
	$end = intval( $end );

	# Don't generate TeX PNGs (lack of a sensible current directory causes errors anyway)
	$wgUser->setOption( 'math', MW_MATH_SOURCE );

	# Don't generate extension images (e.g. Timeline)
	if ( method_exists( $wgParser, "clearTagHooks" ) ) {
		$wgParser->clearTagHooks();
	}

	# Don't use HTML tidy
	$wgUseTidy = false;

	$what = $redirectsOnly ? "redirects" : "links";

	if ( $oldRedirectsOnly ) {
		# This entire code path is cut-and-pasted from below. Hurrah.
		$res = $dbr->query(
			"SELECT page_id ".
			"FROM page ".
			"LEFT JOIN redirect ON page_id=rd_from ".
			"WHERE page_is_redirect=1 AND rd_from IS NULL AND ".
			( $end == 0 ? "page_id >= $start"
				: "page_id BETWEEN $start AND $end" ),
			$fname
		);
		$num = $dbr->numRows( $res );
		print "Refreshing $num old redirects from $start...\n";

		# The counter must start at zero; it was previously left undefined here
		$i = 0;
		while ( $row = $dbr->fetchObject( $res ) ) {
			if ( !( ++$i % $reportingInterval ) ) {
				print "$i\n";
				wfWaitForSlaves( $maxLag );
			}
			fixRedirect( $row->page_id );
		}
	} elseif ( $newOnly ) {
		print "Refreshing $what from ";
		$res = $dbr->select( 'page',
			array( 'page_id' ),
			array(
				'page_is_new' => 1,
				"page_id >= $start" ),
			$fname
		);
		$num = $dbr->numRows( $res );
		print "$num new articles...\n";

		$i = 0;
		while ( $row = $dbr->fetchObject( $res ) ) {
			if ( !( ++$i % $reportingInterval ) ) {
				print "$i\n";
				wfWaitForSlaves( $maxLag );
			}
			if ( $redirectsOnly ) {
				fixRedirect( $row->page_id );
			} else {
				fixLinksFromArticle( $row->page_id );
			}
		}
	} else {
		print "Refreshing $what table.\n";
		if ( !$end ) {
			$end = $dbr->selectField( 'page', 'max(page_id)', false );
		}
		print( "Starting from page_id $start of $end.\n" );

		# Walks every ID in the range; the per-page helpers return early
		# when no title can be loaded for an ID
		for ( $id = $start; $id <= $end; $id++ ) {
			if ( !( $id % $reportingInterval ) ) {
				print "$id\n";
				wfWaitForSlaves( $maxLag );
			}
			if ( $redirectsOnly ) {
				fixRedirect( $id );
			} else {
				fixLinksFromArticle( $id );
			}
		}
	}
}
93
/**
 * Updates the stored redirect entry for a single page.
 *
 * Loads the title for $id into the global $wgTitle, wraps it in an Article
 * (stored in the global $wgArticle) and asks that article for its redirect
 * target. Nothing is written when no title can be loaded for the ID or when
 * followRedirect() yields a false or non-object value.
 *
 * @param $id page_id of the page to examine
 */
function fixRedirect( $id ){
	global $wgTitle, $wgArticle;

	$wgTitle = Title::newFromID( $id );
	# The master connection is fetched before the title is checked
	$dbw = wfGetDB( DB_MASTER );

	if ( !is_null( $wgTitle ) ) {
		$wgArticle = new Article( $wgTitle );
		$target = $wgArticle->followRedirect();

		# Only an object result is treated as a usable redirect target
		if ( $target != false && is_object( $target ) ) {
			$wgArticle->updateRedirectOn( $dbw, $target );
		}
	}
}
112
/**
 * Re-parses the current revision of one page and rewrites its link tables.
 *
 * The title for $id is stored in the global $wgTitle and the LinkCache
 * singleton is cleared on every call. The function returns without writing
 * anything when no title or no revision can be loaded for the ID.
 *
 * @param $id page_id of the page to refresh
 */
function fixLinksFromArticle( $id ) {
	global $wgTitle, $wgParser;

	$wgTitle = Title::newFromID( $id );
	$dbw = wfGetDB( DB_MASTER );

	$linkCache =& LinkCache::singleton();
	$linkCache->clear();

	if ( is_null( $wgTitle ) ) {
		return;
	}

	$revision = Revision::newFromTitle( $wgTitle );
	if ( !$revision ) {
		return;
	}

	# Open the transaction only once there is something to write. It used to
	# be opened before the revision check, so the early return above left it
	# open on the master connection.
	$dbw->begin();

	$options = new ParserOptions;
	$parserOutput = $wgParser->parse( $revision->getText(), $wgTitle, $options, true, true, $revision->getId() );
	$update = new LinksUpdate( $wgTitle, $parserOutput, false );
	$update->doUpdate();
	$dbw->immediateCommit();
}
138
/**
 * Removes rows from the pagelinks, imagelinks, categorylinks, templatelinks
 * and externallinks tables whose source page ID has no matching row in the
 * page table.
 *
 * Orphaned IDs are read from the slave connection with result buffering
 * switched off and deleted on the master in batches via deleteBatch().
 * Progress is printed to standard output.
 *
 * @param $maxLag    Value handed to wfWaitForSlaves() once, before any work
 * @param $batchSize The size of deletion batches (values below 1 are
 *                   treated as 1)
 *
 * @author Merlijn van Deen <valhallasw@arctus.nl>
 */
function deleteLinksFromNonexistent( $maxLag = 0, $batchSize = 100 ) {
	$fname = 'deleteLinksFromNonexistent';
	wfWaitForSlaves( $maxLag );

	# Guard the modulo below against a zero or negative batch size
	$batchSize = max( 1, intval( $batchSize ) );

	$dbw = wfGetDB( DB_MASTER );
	$dbr = wfGetDB( DB_SLAVE );
	$dbr->bufferResults( false );

	# table name => column holding the source page ID
	$linksTables = array(
		'pagelinks' => 'pl_from',
		'imagelinks' => 'il_from',
		'categorylinks' => 'cl_from',
		'templatelinks' => 'tl_from',
		'externallinks' => 'el_from',
	);

	$readPage = $dbr->tableName( 'page' );
	foreach ( $linksTables as $table => $field ) {
		$readLinks = $dbr->tableName( $table );

		$sql = "SELECT DISTINCT( $field ) FROM $readLinks LEFT JOIN $readPage ON $field=page_id WHERE page_id IS NULL;";
		print "Retrieving illegal entries from $table: \tRUNNING";

		$results = $dbr->query( $sql, $fname . ':' . $readLinks );
		# Seven backspaces move the cursor back over the "RUNNING" marker
		print "\x08\x08\x08\x08\x08\x08\x08";

		# Rows are counted while iterating rather than through numRows():
		# with buffering switched off the row count may not be known until
		# every row has been fetched.
		$counter = 0;
		$list = array();
		foreach ( $results as $row ) {
			if ( $counter == 0 ) {
				print "Removing illegal links: 1..";
			}
			$counter++;
			$list[] = $row->$field;
			if ( ( $counter % $batchSize ) == 0 ) {
				print $counter . "..";
				deleteBatch( $dbw, $table, $field, $list );
				# Must be reset to an array; resetting to a string broke the
				# following append and the final flush
				$list = array();
			}
		}

		if ( $counter == 0 ) {
			print "0 illegal " . $field . "s. \n";
			continue;
		}

		print $counter . "\n";
		# Flush the last, partially filled batch
		if ( count( $list ) > 0 ) {
			deleteBatch( $dbw, $table, $field, $list );
		}
	}
}
196
/**
 * Deletes a batch of items from a table.
 * Runs the query: DELETE FROM <$table> WHERE <$field> IN (<$list>)
 *
 * Does nothing when $list is empty or is not an array. Before running the
 * query the connection is pinged; while the ping fails the function retries
 * every 10 seconds, without limit, printing a dot per attempt.
 *
 * @param $dbw Database Database object to run the DELETE query on
 * @param $table table to work on; will be converted via $dbw->tableName.
 * @param $field column to search in
 * @param $list values to remove. Array with SQL-safe (!) values; they are
 *              joined into the query without quoting or escaping.
 *
 * @author Merlijn van Deen <valhallasw@arctus.nl>
 */
function deleteBatch($dbw, $table, $field, $list) {
	# A non-array (e.g. an empty string) must not reach the query builder:
	# it would produce "IN ()", which is not valid SQL
	if ( !is_array( $list ) || count( $list ) == 0 ) {
		return;
	}

	$masterLinks = $dbw->tableName( $table );
	# Label the query with the actual table name
	$fname = 'deleteBatch:' . $masterLinks;

	if ( !$dbw->ping() ) {
		print "\nDB disconnected, reconnecting...";
		while ( !$dbw->ping() ) {
			print ".";
			sleep( 10 );
		}
		print "\n";
	}

	$sql = "DELETE FROM $masterLinks WHERE $field IN (" . implode( ",", $list ) . ");";
	$dbw->query( $sql, $fname );
}