From f51b580f0c7ee63539e16d3017f8dd0de0ee2391 Mon Sep 17 00:00:00 2001 From: Sam Reed Date: Tue, 18 Oct 2011 17:29:06 +0000 Subject: [PATCH] Kill dead/rotting importUseModWiki(pedia)? maintenance scripts --- maintenance/importUseModWiki.php | 375 ----------- maintenance/importUseModWikipedia.php | 892 -------------------------- 2 files changed, 1267 deletions(-) delete mode 100644 maintenance/importUseModWiki.php delete mode 100644 maintenance/importUseModWikipedia.php diff --git a/maintenance/importUseModWiki.php b/maintenance/importUseModWiki.php deleted file mode 100644 index a28d57a55f..0000000000 --- a/maintenance/importUseModWiki.php +++ /dev/null @@ -1,375 +0,0 @@ - - * Based loosely on Magnus's code from 2001-2002 - * - * Updated limited version to get something working temporarily - * 2003-10-09 - * Be sure to run the link & index rebuilding scripts! - * - * Some more munging for charsets etc - * 2003-11-28 - * - * Partial fix for pages starting with lowercase letters (??) - * and CamelCase and /Subpage link conversion - * 2004-11-17 - * - * Rewrite output to create Special:Export format for import - * instead of raw SQL. Should be 'future-proof' against future - * schema changes. - * 2005-03-14 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
- * http://www.gnu.org/copyleft/gpl.html - * - * @todo document - * @file - * @ingroup Maintenance - */ - -require_once( "Maintenance.php" ); - -class ImportUseModWiki extends Maintenance { - - private $encoding, $rootDirectory = ''; - - /** - * Field separators - * @var String - */ - private $FS1, $FS2, $FS3 = ''; - - /** - * @var Array - */ - private $usercache, $nowiki = array(); - - public function __construct() { - parent::__construct(); - $this->mDescription = "Import pages from UseMod wikis"; - $this->addOption( 'encoding', 'Encoding of the imported text, default CP1252', false, true ); - /** - * If UseModWiki's New File System is used: - * $NewFS = 1; # 1 = new multibyte $FS, 0 = old $FS - * Use "\xb3"; for the Old File System - * Changed with UTF-8 UseModWiki - * http://www.usemod.com/cgi-bin/wiki.pl?SupportForUtf8 - * http://www.usemod.com/cgi-bin/wiki.pl?WikiBugs/NewFieldSeparatorWronglyTreated - * http://www.meatballwiki.org/wiki/WikiEngine#Q_amp_A - */ - $this->addOption( 'separator', 'Field separator to use, default \x1E\xFF\xFE\x1E', false, true ); - $this->addArg( 'path', 'Path to your UseMod wiki' ); - } - - public function execute() { - $this->rootDirectory = $this->getArg(); - $this->encoding = $this->getOption( 'encoding', 'CP1252' ); - $sep = $this->getOption( 'separator', "\x1E\xFF\xFE\x1E" ); - $this->FS1 = "{$sep}1"; - $this->FS2 = "{$sep}2"; - $this->FS3 = "{$sep}3"; - - echo << - - - -XML; - $letters = array( - 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', - 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', - 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'other' ); - foreach ( $letters as $letter ) { - $dir = "{$this->rootDirectory}/page/$letter"; - if ( is_dir( $dir ) ) - $this->importPageDirectory( $dir ); - } - echo << - -XML; - } - - private function importPageDirectory( $dir, $prefix = "" ) { - echo "\n\n"; - $mydir = opendir( $dir ); - while ( $entry = readdir( $mydir ) ) { - $m = array(); - if ( preg_match( '/^(.+)\.db$/', $entry, $m ) ) { - 
echo $this->importPage( $prefix . $m[1] ); - } else { - if ( is_dir( "$dir/$entry" ) ) { - if ( $entry != '.' && $entry != '..' ) { - $this->importPageDirectory( "$dir/$entry", "$entry/" ); - } - } else { - echo "\n"; - } - } - } - } - - private function useModFilename( $title ) { - $c = substr( $title, 0, 1 ); - if ( preg_match( '/[A-Z]/i', $c ) ) { - return strtoupper( $c ) . "/$title"; - } - return "other/$title"; - } - - private function fetchPage( $title ) { - $fname = $this->rootDirectory . "/page/" . $this->useModFilename( $title ) . ".db"; - if ( !file_exists( $fname ) ) { - echo "Couldn't open file '$fname' for page '$title'.\n"; - die( -1 ); - } - - $page = $this->splitHash( $this->FS1, file_get_contents( $fname ) ); - $section = $this->splitHash( $this->FS2, $page["text_default"] ); - $text = $this->splitHash( $this->FS3, $section["data"] ); - - return $this->array2object( array( "text" => $text["text"] , "summary" => $text["summary"] , - "minor" => $text["minor"] , "ts" => $section["ts"] , - "username" => $section["username"] , "host" => $section["host"] ) ); - } - - private function fetchKeptPages( $title ) { - $fname = $this->rootDirectory . "/keep/" . $this->useModFilename( $title ) . 
".kp"; - if ( !file_exists( $fname ) ) return array(); - - $keptlist = explode( $this->FS1, file_get_contents( $fname ) ); - array_shift( $keptlist ); # Drop the junk at beginning of file - - $revisions = array(); - foreach ( $keptlist as $rev ) { - $section = $this->splitHash( $this->FS2, $rev ); - $text = $this->splitHash( $this->FS3, $section["data"] ); - if ( $text["text"] && $text["minor"] != "" && ( $section["ts"] * 1 > 0 ) ) { - array_push( $revisions, $this->array2object( array ( "text" => $text["text"] , "summary" => $text["summary"] , - "minor" => $text["minor"] , "ts" => $section["ts"] , - "username" => $section["username"] , "host" => $section["host"] ) ) ); - } else { - echo "\n"; - } - } - return $revisions; - } - - private function splitHash( $sep , $str ) { - $temp = explode ( $sep , $str ) ; - $ret = array () ; - for ( $i = 0; $i + 1 < count ( $temp ) ; $i++ ) { - $ret[$temp[$i]] = $temp[++$i] ; - } - return $ret ; - } - - private function checkUserCache( $name, $host ) { - if ( $name ) { - if ( in_array( $name, $this->usercache ) ) { - $userid = $this->usercache[$name]; - } else { - # If we haven't imported user accounts - $userid = 0; - } - $username = str_replace( '_', ' ', $name ); - } else { - $userid = 0; - $username = $host; - } - return array( $userid, $username ); - } - - private function importPage( $title ) { - echo "\n\n"; - $page = $this->fetchPage( $title ); - - $newtitle = $this->xmlsafe( str_replace( '_', ' ', $this->recodeText( $title ) ) ); - - $munged = $this->mungeFormat( $page->text ); - if ( $munged != $page->text ) { - /** - * Save a *new* revision with the conversion, and put the - * previous last version into the history. 
- */ - $next = $this->array2object( array( - 'text' => $munged, - 'minor' => 1, - 'username' => 'Conversion script', - 'host' => '127.0.0.1', - 'ts' => time(), - 'summary' => 'link fix', - ) ); - $revisions = array( $page, $next ); - } else { - /** - * Current revision: - */ - $revisions = array( $page ); - } - $xml = << - $newtitle - -XML; - - # History - $revisions = array_merge( $revisions, $this->fetchKeptPages( $title ) ); - if ( count( $revisions ) == 0 ) { - return NULL; // Was "$sql", which does not appear to be defined. - } - - foreach ( $revisions as $rev ) { - $text = $this->xmlsafe( $this->recodeText( $rev->text ) ); - $minor = ( $rev->minor ? '' : '' ); - list( /* $userid */ , $username ) = $this->checkUserCache( $rev->username, $rev->host ); - $username = $this->xmlsafe( $this->recodeText( $username ) ); - $timestamp = $this->xmlsafe( $this->timestamp2ISO8601( $rev->ts ) ); - $comment = $this->xmlsafe( $this->recodeText( $rev->summary ) ); - - $xml .= << - $timestamp - $username - $minor - $comment - $text - - -XML; - } - $xml .= "\n\n"; - return $xml; - } - - private function recodeText( $string ) { - # For currently latin-1 wikis - $string = str_replace( "\r\n", "\n", $string ); - $string = @iconv( $this->encoding, "UTF-8", $string ); - $string = $this->mungeToUtf8( $string ); # Any old Ӓ stuff - return $string; - } - - /** - * @todo FIXME: Don't use /e - */ - private function mungeToUtf8( $string ) { - $string = preg_replace ( '/&#([0-9]+);/e', 'wfUtf8Sequence($1)', $string ); - $string = preg_replace ( '/&#x([0-9a-f]+);/ie', 'wfUtf8Sequence(0x$1)', $string ); - # Should also do named entities here - return $string; - } - - private function timestamp2ISO8601( $ts ) { - # 2003-08-05T18:30:02Z - return gmdate( 'Y-m-d', $ts ) . 'T' . gmdate( 'H:i:s', $ts ) . 'Z'; - } - - /** - * The page may contain old data which has not been properly normalized. 
- * Invalid UTF-8 sequences or forbidden control characters will make our - * XML output invalid, so be sure to strip them out. - * @param String $string Text to clean up - * @return String - */ - private function xmlsafe( $string ) { - $string = UtfNormal::cleanUp( $string ); - $string = htmlspecialchars( $string ); - return $string; - } - - private function xmlCommentSafe( $text ) { - return str_replace( '--', '\\-\\-', $this->xmlsafe( $this->recodeText( $text ) ) ); - } - - private function array2object( $arr ) { - $o = (object)0; - foreach ( $arr as $x => $y ) { - $o->$x = $y; - } - return $o; - } - - /** - * Make CamelCase and /Talk links work - */ - private function mungeFormat( $text ) { - $this->nowiki = array(); - $staged = preg_replace_callback( - '/(.*?<\\/nowiki>|(?:http|https|ftp):\\S+|\[\[[^]\\n]+]])/s', - array( $this, 'nowikiPlaceholder' ), $text ); - - # This is probably not 100% correct, I'm just - # glancing at the UseModWiki code. - $upper = "[A-Z]"; - $lower = "[a-z_0-9]"; - $any = "[A-Za-z_0-9]"; - $camel = "(?:$upper+$lower+$upper+$any*)"; - $subpage = "(?:\\/$any+)"; - $substart = "(?:\\/$upper$any*)"; - - $munged = preg_replace( "/(?!\\[\\[)($camel$subpage*|$substart$subpage*)\\b(?!\\]\\]|>)/", - '[[$1]]', $staged ); - - $final = preg_replace( '/' . preg_quote( $this->placeholder() ) . '/s', - array( $this, 'nowikiShift' ), $munged ); - return $final; - } - - private function placeholder( $x = null ) { - return '\xffplaceholder\xff'; - } - - public function nowikiPlaceholder( $matches ) { - $this->nowiki[] = $matches[1]; - return $this->placeholder(); - } - - public function nowikiShift() { - return array_shift( $this->nowiki ); - } -} - -function wfUtf8Sequence( $codepoint ) { - if ( $codepoint < 0x80 ) { - return chr( $codepoint ); - } - if ( $codepoint < 0x800 ) { - return chr( $codepoint >> 6 & 0x3f | 0xc0 ) . - chr( $codepoint & 0x3f | 0x80 ); - } - if ( $codepoint < 0x10000 ) { - return chr( $codepoint >> 12 & 0x0f | 0xe0 ) . 
- chr( $codepoint >> 6 & 0x3f | 0x80 ) . - chr( $codepoint & 0x3f | 0x80 ); - } - if ( $codepoint < 0x100000 ) { - return chr( $codepoint >> 18 & 0x07 | 0xf0 ) . # Double-check this - chr( $codepoint >> 12 & 0x3f | 0x80 ) . - chr( $codepoint >> 6 & 0x3f | 0x80 ) . - chr( $codepoint & 0x3f | 0x80 ); - } - # Doesn't yet handle outside the BMP - return "&#$codepoint;"; -} - -$maintClass = 'ImportUseModWiki'; -require_once( RUN_MAINTENANCE_IF_MAIN ); diff --git a/maintenance/importUseModWikipedia.php b/maintenance/importUseModWikipedia.php deleted file mode 100644 index c4b8112f26..0000000000 --- a/maintenance/importUseModWikipedia.php +++ /dev/null @@ -1,892 +0,0 @@ - 983862286, - 'TexaS' => 983918410, - 'HistoryOfUnitedStatesTalk' => 984795423, - 'MetallicA' => 985128533, - 'PythagoreanTheorem' => 985225545, - 'TheCanonofScripture' => 985368223, - 'TaoTehChing' => 985368222, - //'TheMostRemarkableFormulaInTheWorld' => 985368221, - 'TheRecorder' => 985368220, - 'GladstoneOregon' => 985368219, - 'PacificBeach' => '?', - 'AaRiver' => '?', - ); - - var $replacements = array(); - - var $renameTextLinksOps = array( - 983846265 => array( - 'TestIgnore' => 'IgnoreTest', - ), - 983848080 => array( - 'UnitedLocomotiveWorks' => 'Atlas Shrugged/United Locomotive Works' - ), - 983856376 => array( - 'WikiPedia' => 'Wikipedia', - ), - 983896152 => array( - 'John_F_Kennedy' => 'John_F._Kennedy', - ), - 983905871 => array( - 'LarrySanger' => 'Larry_Sanger' - ), - 984697068 => array( - 'UnitedStates' => 'United States', - ), - 984792748 => array( - 'LibertarianisM' => 'Libertarianism' - ), - 985327832 => array( - 'AnarchisM' => 'Anarchism', - ), - 985290063 => array( - 'HistoryOfUnitedStatesDiscussion' => 'History_Of_United_States_Discussion' - ), - 985290091 => array( - 'BritishEmpire' => 'British Empire' - ), - /* - 985468958 => array( - 'ScienceFiction' => 'Science fiction', - ),*/ - ); - - /** - * Hack for observed substitution issues - */ - var $skipSelfSubstitution = array( - 
'Pythagorean_Theorem', - 'The_Most_Remarkable_Formula_In_The_World', - 'Wine', - ); - - var $unixLineEndingsOps = array( - 987743732 => 'Wikipedia_FAQ' - ); - - var $replacementsDone = array(); - - var $moveLog = array(); - var $moveDests = array(); - var $revId; - - var $rc = array(); - var $textCache = array(); - var $blacklist = array(); - - var $FS, $FS1, $FS2, $FS3; - var $FreeLinkPattern, $UrlPattern, $LinkPattern, $InterLinkPattern; - - var $cp1252Table = array( -0x80 => 0x20ac, -0x81 => 0x0081, -0x82 => 0x201a, -0x83 => 0x0192, -0x84 => 0x201e, -0x85 => 0x2026, -0x86 => 0x2020, -0x87 => 0x2021, -0x88 => 0x02c6, -0x89 => 0x2030, -0x8a => 0x0160, -0x8b => 0x2039, -0x8c => 0x0152, -0x8d => 0x008d, -0x8e => 0x017d, -0x8f => 0x008f, -0x90 => 0x0090, -0x91 => 0x2018, -0x92 => 0x2019, -0x93 => 0x201c, -0x94 => 0x201d, -0x95 => 0x2022, -0x96 => 0x2013, -0x97 => 0x2014, -0x98 => 0x02dc, -0x99 => 0x2122, -0x9a => 0x0161, -0x9b => 0x203a, -0x9c => 0x0153, -0x9d => 0x009d, -0x9e => 0x017e, -0x9f => 0x0178); - - public function __construct() { - parent::__construct(); - $this->addOption( 'datadir', 'the value of $DataDir from wiki.cgi', true, true ); - $this->addOption( 'outfile', 'the name of the output XML file', true, true ); - $this->initLinkPatterns(); - - $this->encodeMap = $this->decodeMap = array(); - - for ($source = 0; $source <= 0xff; $source++) { - if ( isset( $this->cp1252Table[$source] ) ) { - $dest = $this->cp1252Table[$source]; - } else { - $dest = $source; - } - $sourceChar = chr( $source ); - $destChar = codepointToUtf8( $dest ); - $this->encodeMap[$sourceChar] = $destChar; - $this->decodeMap[$destChar] = $sourceChar; - } - } - - function initLinkPatterns() { - # Field separators are used in the URL-style patterns below. - $this->FS = "\xb3"; # The FS character is a superscript "3" - $this->FS1 = $this->FS . "1"; # The FS values are used to separate fields - $this->FS2 = $this->FS . "2"; # in stored hashtables and other data structures. 
- $this->FS3 = $this->FS . "3"; # The FS character is not allowed in user data. - - $UpperLetter = "[A-Z"; - $LowerLetter = "[a-z"; - $AnyLetter = "[A-Za-z"; - $AnyLetter .= "_0-9"; - $UpperLetter .= "]"; $LowerLetter .= "]"; $AnyLetter .= "]"; - - # Main link pattern: lowercase between uppercase, then anything - $LpA = $UpperLetter . "+" . $LowerLetter . "+" . $UpperLetter - . $AnyLetter . "*"; - # Optional subpage link pattern: uppercase, lowercase, then anything - $LpB = $UpperLetter . "+" . $LowerLetter . "+" . $AnyLetter . "*"; - - # Loose pattern: If subpage is used, subpage may be simple name - $this->LinkPattern = "((?:(?:$LpA)?\\/$LpB)|$LpA)"; - $QDelim = '(?:"")?'; # Optional quote delimiter (not in output) - $this->LinkPattern .= $QDelim; - - # Inter-site convention: sites must start with uppercase letter - # (Uppercase letter avoids confusion with URLs) - $InterSitePattern = $UpperLetter . $AnyLetter . "+"; - $this->InterLinkPattern = "((?:$InterSitePattern:[^\\]\\s\"<>{$this->FS}]+)$QDelim)"; - - $AnyLetter = "[-,. _0-9A-Za-z]"; - $this->FreeLinkPattern = "($AnyLetter+)"; - $this->FreeLinkPattern = "((?:(?:$AnyLetter+)?\\/)?$AnyLetter+)"; - $this->FreeLinkPattern .= $QDelim; - - # Url-style links are delimited by one of: - # 1. Whitespace (kept in output) - # 2. Left or right angle-bracket (< or >) (kept in output) - # 3. Right square-bracket (]) (kept in output) - # 4. A single double-quote (") (kept in output) - # 5. A $FS (field separator) character (kept in output) - # 6. A double double-quote ("") (removed from output) - - $UrlProtocols = "http|https|ftp|afs|news|nntp|mid|cid|mailto|wais|" - . "prospero|telnet|gopher"; - $UrlProtocols .= '|file'; - $this->UrlPattern = "((?:(?:$UrlProtocols):[^\\]\\s\"<>{$this->FS}]+)$QDelim)"; - $ImageExtensions = "(gif|jpg|png|bmp|jpeg)"; - $RFCPattern = "RFC\\s?(\\d+)"; - $ISBNPattern = "ISBN:?([0-9- xX]{10,})"; - } - - function execute() { - $this->articleFileName = '/tmp/importUseMod.' . 
mt_rand( 0, 0x7ffffff ) . '.tmp'; - $this->patchFileName = '/tmp/importUseMod.' . mt_rand( 0, 0x7ffffff ) . '.tmp'; - $this->dataDir = $this->getOption( 'datadir' ); - $this->outFile = fopen( $this->getOption( 'outfile' ), 'w' ); - if ( !$this->outFile ) { - echo "Unable to open output file\n"; - return 1; - } - $this->writeXmlHeader(); - $this->readRclog(); - $this->writeMoveLog(); - $this->writeRevisions(); - $this->reconcileCurrentRevs(); - $this->writeXmlFooter(); - unlink( $this->articleFileName ); - unlink( $this->patchFileName ); - return 0; - } - - function writeXmlHeader() { - fwrite( $this->outFile, << - - Wikipedia - http://www.wikipedia.com/ - MediaWiki 1.18alpha importUseModWikipedia.php - case-sensitive - - - - - -EOT - ); - } - - function writeXmlFooter() { - fwrite( $this->outFile, "\n" ); - } - - function readRclog() { - $rcFile = fopen( "{$this->dataDir}/rclog", 'r' ); - while ( $line = fgets( $rcFile ) ) { - $bits = explode( $this->FS3, $line ); - if ( count( $bits ) !== 7 ) { - echo "Error reading rclog\n"; - return; - } - $params = array( - 'timestamp' => $bits[0], - 'rctitle' => $bits[1], - 'summary' => $bits[2], - 'minor' => $bits[3], - 'host' => $bits[4], - 'kind' => $bits[5], - 'extra' => array() - ); - $extraList = explode( $this->FS2, $bits[6] ); - - for ( $i = 0; $i < count( $extraList ); $i += 2 ) { - $params['extra'][$extraList[$i]] = $extraList[$i + 1]; - } - $this->rc[$params['timestamp']][] = $params; - } - } - - function writeMoveLog() { - $this->moveLog = array(); - $deepRenames = $this->deepRenames; - echo "Calculating move log...\n"; - $this->processDiffFile( array( $this, 'moveLogCallback' ) ); - - // We have the timestamp intervals, now make a guess at the actual timestamp - foreach ( $this->moveLog as $newTitle => $params ) { - // Is there a time specified? - $drTime = false; - if ( isset( $deepRenames[$params['old']] ) ) { - $drTime = $deepRenames[$params['old']]; - if ( $drTime !== '?' 
) { - if ( ( !isset( $params['endTime'] ) || $drTime < $params['endTime'] ) - && $drTime > $params['startTime'] ) - { - $this->moveLog[$newTitle]['timestamp'] = $drTime; - $this->moveLog[$newTitle]['deep'] = true; - - echo "{$params['old']} -> $newTitle at $drTime\n"; - unset( $deepRenames[$params['old']] ); - continue; - } else { - echo "WARNING: deep rename time invalid: {$params['old']}\n"; - unset( $deepRenames[$params['old']] ); - } - } - } - - // Guess that it is one second after the last edit to the page before it was moved - $this->moveLog[$newTitle]['timestamp'] = $params['startTime'] + 1; - if ( $drTime === '?' ) { - $this->moveLog[$newTitle]['deep'] = true; - unset( $deepRenames[$params['old']] ); - } - if ( isset( $params['endTime'] ) ) { - $this->printLatin1( "{$params['old']} -> $newTitle between " . - "{$params['startTime']} and {$params['endTime']}\n" ); - } else { - $this->printLatin1( "{$params['old']} -> $newTitle after " . - "{$params['startTime']}\n" ); - } - } - - // Write the move log to the XML file - $id = 1; - foreach ( $this->moveLog as $newTitle => $params ) { - $out = "\n" . - $this->element( 'id', $id++ ) . - $this->element( 'timestamp', wfTimestamp( TS_ISO_8601, $params['timestamp'] ) ) . - "\n" . - $this->element( 'username', 'UseModWiki admin' ) . - "" . - $this->element( 'type', 'move' ) . - $this->element( 'action', 'move' ) . - $this->element( 'logtitle', $params['old'] ) . - "" . - htmlspecialchars( $this->encode( "{$newTitle}\n1" ) ) . - "\n" . - "\n"; - fwrite( $this->outFile, $out ); - } - - // Check for remaining deep rename entries - if ( $deepRenames ) { - echo "WARNING: the following entries in \$this->deepRenames are " . - "invalid, since no such move exists:\n" . - implode( "\n", array_keys( $deepRenames ) ) . - "\n\n"; - } - - } - - function element( $name, $value ) { - return "<$name>" . htmlspecialchars( $this->encode( $value ) ) . 
"\n"; - } - - function moveLogCallback( $entry ) { - $rctitle = $entry['rctitle']; - $title = $entry['title']; - $this->moveDests[$rctitle] = $title; - - if ( $rctitle === $title ) { - if ( isset( $this->moveLog[$rctitle] ) - && !isset( $this->moveLog[$rctitle]['endTime'] ) ) - { - // This is the latest time that the page could have been moved - $this->moveLog[$rctitle]['endTime'] = $entry['timestamp']; - } - } else { - if ( !isset( $this->moveLog[$rctitle] ) ) { - // Initialise the move log entry - $this->moveLog[$rctitle] = array( - 'old' => $title - ); - } - // Update the earliest time the page could have been moved - $this->moveLog[$rctitle]['startTime'] = $entry['timestamp']; - } - } - - function writeRevisions() { - $this->numGoodRevs = 0; - $this->revId = 1; - $this->processDiffFile( array( $this, 'revisionCallback' ) ); - echo "\n\nImported {$this->numGoodRevs} out of {$this->numRevs}\n"; - } - - function revisionCallback( $params ) { - $title = $params['rctitle']; - $editTime = $params['timestamp']; - - if ( isset( $this->blacklist[$title] ) ) { - return; - } - $this->doPendingOps( $editTime ); - - $origText = $this->getText( $title ); - $text = $this->patch( $origText, $params['diff'] ); - if ( $text === false ) { - echo "$editTime $title attempting resolution...\n"; - $linkSubstitutes = $this->resolveFailedDiff( $origText, $params['diff'] ); - if ( !$linkSubstitutes ) { - $this->printLatin1( "$editTime $title DIFF FAILED\n" ); - $this->blacklist[$title] = true; - return; - } - $this->printLatin1( "$editTime $title requires substitutions:\n" ); - $time = $editTime - 1; - foreach ( $linkSubstitutes as $old => $new ) { - $this->printLatin1( "SUBSTITUTE $old -> $new\n" ); - $this->renameTextLinks( $old, $new, $time-- ); - } - $origText = $this->getText( $title ); - $text = $this->patch( $origText, $params['diff'] ); - if ( $text === false ) { - $this->printLatin1( "$editTime $title STILL FAILS!\n" ); - $this->blacklist[$title] = true; - return; - } - - echo 
"\n"; - } - - $params['text'] = $text; - $this->saveRevision( $params ); - $this->numGoodRevs++; - #$this->printLatin1( "$editTime $title\n" ); - } - - function doPendingOps( $editTime ) { - foreach ( $this->moveLog as $newTitle => $entry ) { - if ( $entry['timestamp'] <= $editTime ) { - unset( $this->moveLog[$newTitle] ); - if ( isset( $entry['deep'] ) ) { - $this->renameTextLinks( $entry['old'], $newTitle, $entry['timestamp'] ); - } - } - } - - foreach ( $this->renameTextLinksOps as $renameTime => $replacements ) { - if ( $editTime >= $renameTime ) { - foreach ( $replacements as $old => $new ) { - $this->printLatin1( "SUBSTITUTE $old -> $new\n" ); - $this->renameTextLinks( $old, $new, $renameTime ); - } - unset( $this->renameTextLinksOps[$renameTime] ); - } - } - - foreach ( $this->unixLineEndingsOps as $fixTime => $title ) { - if ( $editTime >= $fixTime ) { - $this->printLatin1( "$fixTime $title FIXING LINE ENDINGS\n" ); - $text = $this->getText( $title ); - $text = str_replace( "\r", '', $text ); - $this->saveRevision( array( - 'rctitle' => $title, - 'timestamp' => $fixTime, - 'extra' => array( 'name' => 'UseModWiki admin' ), - 'text' => $text, - 'summary' => 'Fixing line endings', - ) ); - unset( $this->unixLineEndingsOps[$fixTime] ); - } - } - } - - function patch( $source, $diff ) { - file_put_contents( $this->articleFileName, $source ); - file_put_contents( $this->patchFileName, $diff ); - $error = wfShellExec( - wfEscapeShellArg( - 'patch', - '-n', - '-r', '-', - '--no-backup-if-mismatch', - '--binary', - $this->articleFileName, - $this->patchFileName - ) . 
' 2>&1', - $status - ); - $text = file_get_contents( $this->articleFileName ); - if ( $status || $text === false ) { - return false; - } else { - return $text; - } - } - - function resolveFailedDiff( $origText, $diff ) { - $context = array(); - $diffLines = explode( "\n", $diff ); - for ( $i = 0; $i < count( $diffLines ); $i++ ) { - $diffLine = $diffLines[$i]; - if ( !preg_match( '/^(\d+)(?:,\d+)?[acd]\d+(?:,\d+)?$/', $diffLine, $m ) ) { - continue; - } - - $sourceIndex = intval( $m[1] ); - $i++; - while ( $i < count( $diffLines ) && substr( $diffLines[$i], 0, 1 ) === '<' ) { - $context[$sourceIndex - 1] = substr( $diffLines[$i], 2 ); - $sourceIndex++; - $i++; - } - $i--; - } - - $changedLinks = array(); - $origLines = explode( "\n", $origText ); - foreach ( $context as $i => $contextLine ) { - $origLine = isset( $origLines[$i] ) ? $origLines[$i] : ''; - if ( $contextLine === $origLine ) { - continue; - } - $newChanges = $this->resolveTextChange( $origLine, $contextLine ); - if ( is_array( $newChanges ) ) { - $changedLinks += $newChanges; - } else { - echo "Resolution failure on line " . ( $i + 1 ) . 
"\n"; - $this->printLatin1( $newChanges ); - } - } - - return $changedLinks; - } - - function resolveTextChange( $source, $dest ) { - $changedLinks = array(); - $sourceLinks = $this->getLinkList( $source ); - $destLinks = $this->getLinkList( $dest ); - $newLinks = array_diff( $destLinks, $sourceLinks ); - $removedLinks = array_diff( $sourceLinks, $destLinks ); - - // Match up the removed links with the new links - foreach ( $newLinks as $newLink ) { - $minDistance = 100000000; - $bestRemovedLink = false; - foreach ( $removedLinks as $removedLink ) { - $editDistance = levenshtein( $newLink, $removedLink ); - if ( $editDistance < $minDistance ) { - $minDistance = $editDistance; - $bestRemovedLink = $removedLink; - } - } - if ( $bestRemovedLink !== false ) { - $changedLinks[$bestRemovedLink] = $newLink; - $newLinks = array_diff( $newLinks, array( $newLink ) ); - $removedLinks = array_diff( $removedLinks, array( $bestRemovedLink ) ); - } - } - - $proposal = $source; - foreach ( $changedLinks as $removedLink => $newLink ) { - $proposal = $this->substituteTextLinks( $removedLink, $newLink, $proposal ); - } - if ( $proposal !== $dest ) { - // Resolution failed - $msg = "Source line: $source\n" . - "Source links: " . implode( ', ', $sourceLinks ) . "\n" . - "Context line: $dest\n" . - "Context links: " . implode( ', ', $destLinks ) . "\n" . 
- "Proposal: $proposal\n"; - return $msg; - } - return $changedLinks; - } - - function processDiffFile( $callback ) { - $diffFile = fopen( "{$this->dataDir}/diff_log", 'r' ); - - $delimiter = "------\n"; - file_put_contents( $this->articleFileName, "Describe the new page here.\n" ); - - $line = fgets( $diffFile ); - $lineNum = 1; - if ( $line !== $delimiter ) { - echo "Invalid diff file\n"; - return false; - } - $lastReportLine = 0; - $this->numRevs = 0; - - while ( true ) { - $line = fgets( $diffFile ); - $lineNum++; - if ( $line === false ) { - break; - } - if ( $lineNum > $lastReportLine + 1000 ) { - $lastReportLine = $lineNum; - fwrite( STDERR, "$lineNum \r" ); - fflush( STDERR ); - } - $line = trim( $line ); - if ( !preg_match( '/^([^|]+)\|(\d+)$/', $line, $matches ) ) { - echo "Invalid header on line $lineNum\n"; - return true; - } - list( , $title, $editTime ) = $matches; - - $diff = ''; - $diffStartLine = $lineNum; - while ( true ) { - $line = fgets( $diffFile ); - $lineNum++; - if ( $line === $delimiter ) { - break; - } - if ( $line === false ) { - break 2; - } - $diff .= $line; - } - - $this->numRevs++; - - if ( !isset( $this->rc[$editTime] ) ) { - $this->printLatin1( "$editTime $title DELETED, skipping\n" ); - continue; - } - - if ( count( $this->rc[$editTime] ) == 1 ) { - $params = $this->rc[$editTime][0]; - } else { - $params = false; - $candidates = ''; - foreach ( $this->rc[$editTime] as $rc ) { - if ( $rc['rctitle'] === $title ) { - $params = $rc; - break; - } - if ( $candidates === '' ) { - $candidates = $rc['rctitle']; - } else { - $candidates .= ', ' . 
$rc['rctitle']; - } - } - if ( !$params ) { - $this->printLatin1( "$editTime $title ERROR cannot resolve rclog\n" ); - $this->printLatin1( "$editTime $title CANDIDATES: $candidates\n" ); - continue; - } - } - $params['diff'] = $diff; - $params['title'] = $title; - $params['diffStartLine'] = $diffStartLine; - call_user_func( $callback, $params ); - } - echo "\n"; - - if ( !feof( $diffFile ) ) { - echo "Stopped at line $lineNum\n"; - } - return true; - } - - function reconcileCurrentRevs() { - foreach ( $this->textCache as $title => $text ) { - $fileName = "{$this->dataDir}/page/"; - if ( preg_match( '/^[A-Z]/', $title, $m ) ) { - $fileName .= $m[0]; - } else { - $fileName .= 'other'; - } - $fileName .= "/$title.db"; - - if ( !file_exists( $fileName ) ) { - $this->printLatin1( "ERROR: Cannot find page file for {$title}\n" ); - continue; - } - - $fileContents = file_get_contents( $fileName ); - $page = $this->unserializeUseMod( $fileContents, $this->FS1 ); - $section = $this->unserializeUseMod( $page['text_default'], $this->FS2 ); - $data = $this->unserializeUseMod( $section['data'], $this->FS3 ); - $pageText = $data['text']; - if ( $text !== $pageText ) { - $substs = $this->resolveTextChange( $text, $pageText ); - if ( is_array( $substs ) ) { - foreach ( $substs as $source => $dest ) { - if ( isset( $this->moveLog[$dest] ) ) { - $this->printLatin1( "ERROR: need deep rename: $source\n" ); - } else { - $this->printLatin1( "ERROR: need substitute: $source -> $dest\n" ); - } - } - } else { - $this->printLatin1( "ERROR: unresolved diff in $title:\n" ); - wfSuppressWarnings(); - $diff = xdiff_string_diff( $text, $pageText ) . 
''; - wfRestoreWarnings(); - $this->printLatin1( "$diff\n" ); - } - } - } - }
-
-	/**
-	 * Build a title object for a page name.
-	 *
-	 * The name is run through encode() and the result is handed to
-	 * Title::newFromText().
-	 *
-	 * @param $titleText String
-	 * @return mixed Whatever Title::newFromText() returns
-	 */
-	function makeTitle( $titleText ) {
-		return Title::newFromText( $this->encode( $titleText ) );
-	}
-
-	/**
-	 * Look up the last text stored for a page in $this->textCache.
-	 *
-	 * @param $titleText String Key into $this->textCache
-	 * @return mixed The cached value, or the placeholder
-	 *   "Describe the new page here.\n" when nothing is cached
-	 */
-	function getText( $titleText ) {
-		if ( !isset( $this->textCache[$titleText] ) ) {
-			return "Describe the new page here.\n";
-		} else {
-			return $this->textCache[$titleText];
-		}
-	}
-
-	/**
-	 * Remember the text of a revision and write the revision to $this->outFile.
-	 *
-	 * Keys read from $params: 'rctitle', 'text', 'timestamp', 'summary', and
-	 * optionally 'extra' => 'name', 'extra' => 'id' and 'host'. Each call
-	 * consumes one value of $this->revId (post-increment).
-	 *
-	 * @param $params Array
-	 */
-	function saveRevision( $params ) {
-		// Keep the latest text so getText() and renameTextLinks() can see it
-		$this->textCache[$params['rctitle']] = $params['text'];
-
-		// NOTE(review): the bare "\n" and "" literals in this method look as if
-		// they once held markup that wrapped the element() output; confirm
-		// against the upstream file before relying on the emitted format.
-		$out = "\n" .
-			$this->element( 'title', $params['rctitle'] ) .
-			"\n" .
-			$this->element( 'id', $this->revId ++ ) .
-			$this->element( 'timestamp', wfTimestamp( TS_ISO_8601, $params['timestamp'] ) ) .
-			"\n";
-		if ( isset( $params['extra']['name'] ) ) {
-			$out .= $this->element( 'username', $params['extra']['name'] );
-		}
-		if ( isset( $params['extra']['id'] ) ) {
-			$out .= $this->element( 'id', $params['extra']['id'] );
-		}
-		if ( isset( $params['host'] ) ) {
-			$out .= $this->element( 'ip', $params['host'] );
-		}
-		$out .=
-			"\n" .
-			$this->element( 'comment', $params['summary'] ) .
-			"" .
-			htmlspecialchars( $this->encode( $params['text'] ) ) .
-			"\n" .
-			"\n" .
-			"\n";
-		fwrite( $this->outFile, $out );
-	}
-
-	/**
-	 * Rewrite links to a renamed page in every cached page text.
-	 *
-	 * Each text in $this->textCache is passed through substituteTextLinks();
-	 * whenever that changes the text, a new revision is saved under the user
-	 * name 'Page move link fixup script'.
-	 *
-	 * @param $old String Old page name; underscores are treated as spaces
-	 * @param $new String New page name; underscores are treated as spaces
-	 * @param $timestamp Mixed Timestamp given to the fixup revisions
-	 */
-	function renameTextLinks( $old, $new, $timestamp ) {
-		// The cache keys are compared against the name as passed in,
-		// i.e. before underscores are turned into spaces
-		$newWithUnderscores = $new;
-		$old = str_replace( '_', ' ', $old );
-		$new = str_replace( '_', ' ', $new );
-
-		foreach ( $this->textCache as $title => $oldText ) {
-			if ( $newWithUnderscores === $title
-				&& in_array( $title, $this->skipSelfSubstitution ) )
-			{
-				// Hack to make Pythagorean_Theorem etc. work
-				continue;
-			}
-
-			$newText = $this->substituteTextLinks( $old, $new, $oldText );
-			if ( $oldText !== $newText ) {
-				$this->saveRevision( array(
-					'rctitle' => $title,
-					'timestamp' => $timestamp,
-					'text' => $newText,
-					'extra' => array( 'name' => 'Page move link fixup script' ),
-					'summary' => '',
-					'minor' => true
-				) );
-			}
-		}
-	}
-
-	/**
-	 * Replace links to $old with links to $new inside a page text.
-	 *
-	 * Text that must not be touched (pre, code and nowiki sections, URLs and
-	 * interwiki links) is swapped for placeholders by storeRaw(), the
-	 * remaining free links and wiki links are rewritten by subFreeLink() and
-	 * subWikiLink(), and finally restoreRaw() puts every stored piece back.
-	 *
-	 * @param $old String Link target to look for
-	 * @param $new String Link target to put in its place
-	 * @param $text String Page text
-	 * @return String The text with matching links rewritten
-	 */
-	function substituteTextLinks( $old, $new, $text ) {
-		$this->saveUrl = array();
-		$this->old = $old;
-		$this->new = $new;
-
-		$text = str_replace( $this->FS, '', $text ); # Remove separators (paranoia)
-		$text = preg_replace_callback( '/(<pre>(.*?)<\/pre>)/is',
-			array( $this, 'storeRaw' ), $text );
-		$text = preg_replace_callback( '/(<code>(.*?)<\/code>)/is',
-			array( $this, 'storeRaw' ), $text );
-		$text = preg_replace_callback( '/(<nowiki>(.*?)<\/nowiki>)/s',
-			array( $this, 'storeRaw' ), $text );
-
-		$text = preg_replace_callback( "/\[\[{$this->FreeLinkPattern}\|([^\]]+)\]\]/",
-			array( $this, 'subFreeLink' ), $text );
-		$text = preg_replace_callback( "/\[\[{$this->FreeLinkPattern}\]\]/",
-			array( $this, 'subFreeLink' ), $text );
-		$text = preg_replace_callback( "/(\[{$this->UrlPattern}\s+([^\]]+?)\])/",
-			array( $this, 'storeRaw' ), $text );
-		$text = preg_replace_callback( "/(\[{$this->InterLinkPattern}\s+([^\]]+?)\])/",
-			array( $this, 'storeRaw' ), $text );
-		$text = preg_replace_callback( "/(\[?{$this->UrlPattern}\]?)/",
-			array( $this, 'storeRaw' ), $text );
-		$text = preg_replace_callback( "/(\[?{$this->InterLinkPattern}\]?)/",
-			array( $this, 'storeRaw' ), $text );
-		$text = preg_replace_callback( "/{$this->LinkPattern}/",
-			array( $this, 'subWikiLink' ), $text );
-
-		$text = preg_replace_callback( "/{$this->FS}(\d+){$this->FS}/",
-			array( $this, 'restoreRaw' ), $text );   # Restore saved text
-		return $text;
-	}
-
-	/**
-	 * Collect the link targets that occur in a page text.
-	 *
-	 * Pre, code and nowiki sections, URLs and interwiki links are swapped
-	 * for placeholders by storeRaw() so they cannot be mistaken for links.
-	 * Free links and wiki links go through storeLink(), which appends the
-	 * first capture group of each match to $this->linkList. The rewritten
-	 * text itself is thrown away.
-	 *
-	 * @param $text String Page text
-	 * @return Array The collected $this->linkList
-	 */
-	function getLinkList( $text ) {
-		$this->saveUrl = array();
-		$this->linkList = array();
-
-		$text = str_replace( $this->FS, '', $text ); # Remove separators (paranoia)
-		$text = preg_replace_callback( '/(<pre>(.*?)<\/pre>)/is',
-			array( $this, 'storeRaw' ), $text );
-		$text = preg_replace_callback( '/(<code>(.*?)<\/code>)/is',
-			array( $this, 'storeRaw' ), $text );
-		$text = preg_replace_callback( '/(<nowiki>(.*?)<\/nowiki>)/s',
-			array( $this, 'storeRaw' ), $text );
-
-		$text = preg_replace_callback( "/\[\[{$this->FreeLinkPattern}\|([^\]]+)\]\]/",
-			array( $this, 'storeLink' ), $text );
-		$text = preg_replace_callback( "/\[\[{$this->FreeLinkPattern}\]\]/",
-			array( $this, 'storeLink' ), $text );
-		$text = preg_replace_callback( "/(\[{$this->UrlPattern}\s+([^\]]+?)\])/",
-			array( $this, 'storeRaw' ), $text );
-		$text = preg_replace_callback( "/(\[{$this->InterLinkPattern}\s+([^\]]+?)\])/",
-			array( $this, 'storeRaw' ), $text );
-		$text = preg_replace_callback( "/(\[?{$this->UrlPattern}\]?)/",
-			array( $this, 'storeRaw' ), $text );
-		$text = preg_replace_callback( "/(\[?{$this->InterLinkPattern}\]?)/",
-			array( $this, 'storeRaw' ), $text );
-		$text = preg_replace_callback( "/{$this->LinkPattern}/",
-			array( $this, 'storeLink' ), $text );
-
-		return $this->linkList;
-	}
-
-	/**
-	 * Stash a piece of text in $this->saveUrl and return its placeholder.
-	 *
-	 * The placeholder is the index of the stored entry wrapped in the
-	 * field separator $this->FS on both sides.
-	 *
-	 * @param $m Array Regex match; element 1 is the text to stash
-	 * @return String Placeholder to leave in the text
-	 */
-	function storeRaw( $m ) {
-		// Number of entries before the push, i.e. count afterwards minus one
-		$index = count( $this->saveUrl );
-		$this->saveUrl[] = $m[1];
-		return "{$this->FS}{$index}{$this->FS}";
-	}
-
-	/**
-	 * Callback for free links: swap the target if it is the page being renamed.
-	 *
-	 * The target is compared to $this->old with surrounding whitespace
-	 * removed. On a match it becomes $this->new, otherwise it is kept exactly
-	 * as written. The rebuilt link is stored through storeRaw() so that
-	 * later passes leave it alone.
-	 *
-	 * @param $m Array Regex match; element 1 is the link target, the
-	 *   optional element 2 is the text after the pipe
-	 * @return String Placeholder returned by storeRaw()
-	 */
-	function subFreeLink( $m ) {
-		$link = $m[1];
-		$name = isset( $m[2] ) ? $m[2] : '';
-
-		$trimmed = preg_replace( '/^\s+/', '', $link );
-		$trimmed = preg_replace( '/\s+$/', '', $trimmed );
-		// Strict string comparison: with == numeric-looking titles such as
-		// "007" and "7" would be treated as the same page
-		if ( $trimmed === (string)$this->old ) {
-			$link = $this->new;
-		}
-		// Otherwise $link still holds the original text, spaces included
-
-		$link = "[[$link";
-		if ( $name !== "" ) {
-			$link .= "|$name";
-		}
-		$link .= "]]";
-		return $this->storeRaw( array( 1 => $link ) );
-	}
-
-	/**
-	 * Callback for wiki links: swap the link if it is the page being renamed.
-	 *
-	 * When the new name does not itself match $this->LinkPattern from start
-	 * to end, it is wrapped in [[ ]]. The result is stored through
-	 * storeRaw() so that later passes leave it alone.
-	 *
-	 * @param $m Array Regex match; element 1 is the link text
-	 * @return String Placeholder returned by storeRaw()
-	 */
-	function subWikiLink( $m ) {
-		$link = $m[1];
-		// Strict string comparison: with == numeric-looking titles such as
-		// "007" and "7" would be treated as the same page
-		if ( $link === (string)$this->old ) {
-			$link = $this->new;
-			if ( !preg_match( "/^{$this->LinkPattern}$/", $this->new ) ) {
-				$link = "[[$link]]";
-			}
-		}
-		return $this->storeRaw( array( 1 => $link ) );
-	}
-
-	/**
-	 * Callback that turns a placeholder back into the text stored for it.
-	 *
-	 * @param $m Array Regex match; element 1 is the index into $this->saveUrl
-	 * @return mixed The entry stored at that index
-	 */
-	function restoreRaw( $m ) {
-		$index = $m[1];
-		return $this->saveUrl[$index];
-	}
-
-	/**
-	 * Callback that records a link and replaces it with a placeholder.
-	 *
-	 * Element 1 of the match is appended to $this->linkList; the same match
-	 * is then passed on to storeRaw().
-	 *
-	 * @param $m Array Regex match
-	 * @return String Placeholder returned by storeRaw()
-	 */
-	function storeLink( $m ) {
-		$target = $m[1];
-		$this->linkList[] = $target;
-		$placeholder = $this->storeRaw( $m );
-		return $placeholder;
-	}
-
-	/**
-	 * Translate a string with the replacement pairs in $this->encodeMap.
-	 *
-	 * @param $s String
-	 * @return String Result of strtr() with that map
-	 */
-	function encode( $s ) {
-		$map = $this->encodeMap;
-		return strtr( $s, $map );
-	}
-
-	/**
-	 * Translate a string with the replacement pairs in $this->decodeMap.
-	 *
-	 * @param $s String
-	 * @return String Result of strtr() with that map
-	 */
-	function decode( $s ) {
-		$map = $this->decodeMap;
-		return strtr( $s, $map );
-	}
-
-	/**
-	 * Write a string to the output after passing it through encode().
-	 *
-	 * @param $s String
-	 */
-	function printLatin1( $s ) {
-		$encoded = $this->encode( $s );
-		echo $encoded;
-	}
-
-	/**
-	 * Split a separator-delimited string of alternating keys and values
-	 * into an associative array.
-	 *
-	 * The pieces produced by explode() are read in pairs: even positions
-	 * are keys, the following odd positions are their values. A final key
-	 * with no value after it (odd number of pieces, which includes the
-	 * empty string) is mapped to null.
-	 *
-	 * @param $s String Serialized data
-	 * @param $sep String Separator between the pieces
-	 * @return Array Map of key => value
-	 */
-	function unserializeUseMod( $s, $sep ) {
-		$parts = explode( $sep, $s );
-		$total = count( $parts );
-		$result = array();
-		for ( $i = 0; $i < $total; $i += 2 ) {
-			// Do not read past the end of the list when the last key is unpaired
-			$result[$parts[$i]] = isset( $parts[$i + 1] ) ? $parts[$i + 1] : null;
-		}
-		return $result;
-	}
-}
-
-// Name the class defined in this file, then include the file whose path is
-// held in the RUN_MAINTENANCE_IF_MAIN constant.
-// NOTE(review): presumably that file reads $maintClass and runs the script
-// when this file is the entry point; confirm in Maintenance.php.
-$maintClass = 'ImportUseModWikipedia';
-require_once( RUN_MAINTENANCE_IF_MAIN );
-- 
2.20.1