From: Tim Starling
Date: Thu, 26 Jan 2006 13:29:14 +0000 (+0000)
Subject: * Added externallinks table, to track links to arbitrary URLs
X-Git-Tag: 1.6.0~419
X-Git-Url: http://git.cyclocoop.org/%24href?a=commitdiff_plain;h=eb53cc08560721208e195c0f073809e7b3eee485;p=lhc%2Fweb%2Fwiklou.git

* Added externallinks table, to track links to arbitrary URLs
* Convert unnecessary URL escape codes in external links to their equivalent character before doing anything with them. This prevents certain kinds of spam filter evasion. (Parser.php only)
---

diff --git a/RELEASE-NOTES b/RELEASE-NOTES
index 762f8d0bc4..c3fd72c5b3 100644
--- a/RELEASE-NOTES
+++ b/RELEASE-NOTES
@@ -64,6 +64,7 @@ Database:
   namespace are changed
 * Respect database prefix in dumpHTML.inc
 * Removed read-only check from Database::query()
+* Added externallinks table, to track links to arbitrary URLs
 
 Documentation:
 * (bug 3306) Document $wgLocalTZoffset
@@ -251,6 +252,9 @@ Parser:
 * Fix XML validity checks in parser tests on PHP 5.1
 * (bug 4377) "[" is not valid in URLs
 * (bug 4453) fix for __TOC__ dollar-number breakage
+* Convert unnecessary URL escape codes in external links to their equivalent
+  character before doing anything with them. This prevents certain kinds of
+  spam filter evasion.
 
 Upload:
 * (bug 2527) Always set destination filename when new file is selected
diff --git a/includes/GlobalFunctions.php b/includes/GlobalFunctions.php
index 6454389bb8..a3f5b4a2d7 100644
--- a/includes/GlobalFunctions.php
+++ b/includes/GlobalFunctions.php
@@ -1808,4 +1808,54 @@ function wfBaseName( $path ) {
 	}
 }
 
+/**
+ * Make a URL index, appropriate for the el_index field of externallinks.
+ *
+ * The labels of the host name are reversed and converted to lower case and a
+ * dot is appended; user and password are left out. For example,
+ * http://user:password@Sub.Example.com/page.html becomes
+ * http://com.example.sub./page.html
+ *
+ * @param string $url
+ * @return mixed The index string, or false if $url cannot be parsed, is not an
+ *   HTTP URL or has no host name
+ */
+function wfMakeUrlIndex( $url ) {
+	wfSuppressWarnings();
+	$bits = parse_url( $url );
+	wfRestoreWarnings();
+	// parse_url() leaves out the keys of absent components, so test before reading
+	if ( !$bits || !isset( $bits['scheme'] ) || !isset( $bits['host'] ) ) {
+		return false;
+	}
+	// Scheme names are case-insensitive
+	if ( strtolower( $bits['scheme'] ) !== 'http' ) {
+		return false;
+	}
+	// Drop the root dot of a fully-qualified name, so that "example.com." and
+	// "example.com" get the same index
+	$host = rtrim( strtolower( $bits['host'] ), '.' );
+	if ( $host === '' ) {
+		return false;
+	}
+	// Reverse the labels in the hostname
+	$reversedHost = implode( '.', array_reverse( explode( '.', $host ) ) );
+	// Add an extra dot to the end
+	if ( substr( $reversedHost, -1 ) !== '.' ) {
+		$reversedHost .= '.';
+	}
+	// Reconstruct the pseudo-URL
+	$index = "http://$reversedHost";
+	// Leave out user and password. Add the port, path, query and fragment
+	if ( isset( $bits['port'] ) ) $index .= ':' . $bits['port'];
+	if ( isset( $bits['path'] ) ) {
+		$index .= $bits['path'];
+	} else {
+		$index .= '/';
+	}
+	if ( isset( $bits['query'] ) ) $index .= '?' . $bits['query'];
+	if ( isset( $bits['fragment'] ) ) $index .= '#' . $bits['fragment'];
+	return $index;
+}
+
 ?>
diff --git a/includes/LinksUpdate.php b/includes/LinksUpdate.php
index be61ebdd3c..f909c35d97 100644
--- a/includes/LinksUpdate.php
+++ b/includes/LinksUpdate.php
@@ -19,6 +19,7 @@ class LinksUpdate {
 		$mLinks, # Map of title strings to IDs for the links in the document
 		$mImages, # DB keys of the images used, in the array key only
 		$mTemplates, # Map of title strings to IDs for the template references, including broken ones
+		$mExternals, # URLs of external links, array key only
 		$mCategories, # Map of category names to sort keys
 		$mDb, # Database connection reference
 		$mOptions; # SELECT options to be used (array)
@@ -52,6 +53,7 @@ class LinksUpdate {
 		$this->mLinks =& $this->mParserOutput->getLinks();
 		$this->mImages =& $this->mParserOutput->getImages();
 		$this->mTemplates =& $this->mParserOutput->getTemplates();
+		$this->mExternals =& $this->mParserOutput->getExternalLinks();
 		$this->mCategories =& $this->mParserOutput->getCategories();
 	}
 
@@ -87,6 +89,11 @@ class LinksUpdate {
 		$this->incrTableUpdate( 'imagelinks', 'il', $this->getImageDeletions( $existing ),
$this->getImageInsertions( $existing ) );
 
+		# External links
+		$existing = $this->getExistingExternals();
+		$this->incrTableUpdate( 'externallinks', 'el', $this->getExternalDeletions( $existing ),
+			$this->getExternalInsertions( $existing ) );
+
 		# Category links
 		$existing = $this->getExistingCategories();
 		$this->incrTableUpdate( 'categorylinks', 'cl', $this->getCategoryDeletions( $existing ),
@@ -117,6 +124,7 @@ class LinksUpdate {
 		$this->dumbTableUpdate( 'imagelinks', $this->getImageInsertions(), 'il_from' );
 		$this->dumbTableUpdate( 'categorylinks', $this->getCategoryInsertions(), 'cl_from' );
 		$this->dumbTableUpdate( 'templatelinks', $this->getTemplateInsertions(), 'tl_from' );
+		$this->dumbTableUpdate( 'externallinks', $this->getExternalInsertions(), 'el_from' );
 
 		# Update the cache of all the category pages
 		$this->invalidateCategories( $categoryUpdates );
@@ -238,7 +246,7 @@ class LinksUpdate {
 	function getImageInsertions( $existing = array() ) {
 		$arr = array();
 		$diffs = array_diff_key( $this->mImages, $existing );
-		foreach( $diffs as $iname => $val ) {
+		foreach( $diffs as $iname => $dummy ) {
 			$arr[] = array(
 				'il_from' => $this->mId,
 				'il_to' => $iname
@@ -247,6 +255,23 @@ class LinksUpdate {
 		return $arr;
 	}
 
+	/**
+	 * Build the externallinks rows to insert: one per URL which is not a key of $existing
+	 * @access private
+	 */
+	function getExternalInsertions( $existing = array() ) {
+		$rows = array();
+		$newUrls = array_diff_key( $this->mExternals, $existing );
+		foreach( array_keys( $newUrls ) as $url ) {
+			$rows[] = array(
+				'el_from' => $this->mId,
+				'el_to' => $url,
+				'el_index' => wfMakeUrlIndex( $url ),
+			);
+		}
+		return $rows;
+	}
+
 	/**
 	 * Get an array of category insertions
 	 * @param array $existing Array mapping existing category names to sort keys. If both
@@ -309,6 +334,15 @@ class LinksUpdate {
 		return array_diff_key( $existing, $this->mImages );
 	}
 
+	/**
+	 * Returns the entries of $existing (existing external links, keyed by URL) whose
+	 * key is not among the external links of $this, i.e. those which should be deleted.
+	 * @access private
+	 */
+	function getExternalDeletions( $existing ) {
+		return array_diff_key( $existing, $this->mExternals );
+	}
+
 	/**
 	 * Given an array of existing categories, returns those categories which are not in $this
 	 * and thus should be deleted.
@@ -333,6 +367,7 @@ class LinksUpdate {
 			}
 			$arr[$row->pl_namespace][$row->pl_title] = 1;
 		}
+		$this->mDb->freeResult( $res );
 		return $arr;
 	}
 
@@ -351,6 +386,7 @@ class LinksUpdate {
 			}
 			$arr[$row->tl_namespace][$row->tl_title] = 1;
 		}
+		$this->mDb->freeResult( $res );
 		return $arr;
 	}
 
@@ -366,6 +402,23 @@ class LinksUpdate {
 		while ( $row = $this->mDb->fetchObject( $res ) ) {
 			$arr[$row->il_to] = 1;
 		}
+		$this->mDb->freeResult( $res );
+		return $arr;
+	}
+
+	/**
+	 * Fetch the external links stored for this page, as an array with the URLs in the keys
+	 * @access private
+	 */
+	function getExistingExternals() {
+		$conds = array( 'el_from' => $this->mId );
+		$res = $this->mDb->select( 'externallinks', array( 'el_to' ), $conds,
+			'LinksUpdate::getExistingExternals', $this->mOptions );
+		$arr = array();
+		while ( $row = $this->mDb->fetchObject( $res ) ) {
+			$arr[$row->el_to] = 1;
+		}
+		$this->mDb->freeResult( $res );
 		return $arr;
 	}
 
@@ -381,6 +434,7 @@ class LinksUpdate {
 		while ( $row = $this->mDb->fetchObject( $res ) ) {
 			$arr[$row->cl_to] = $row->cl_sortkey;
 		}
+		$this->mDb->freeResult( $res );
 		return $arr;
 	}
 }
diff --git a/includes/Parser.php b/includes/Parser.php
index 00d2d4eb28..396803475d 100644
--- a/includes/Parser.php
+++ b/includes/Parser.php
@@ -1121,19 +1121,23 @@ class Parser
 
 			# Replace &amp; from obsolete syntax with &.
# All HTML entities will be escaped by makeExternalLink()
-			# or maybeMakeExternalImage()
 			$url = str_replace( '&amp;', '&', $url );
+			# Replace unnecessary URL escape codes with the referenced character
+			# This prevents spammers from hiding links from the filters
+			$url = Parser::replaceUnusualEscapes( $url );
 
 			# Process the trail (i.e. everything after this link up until start of the next link),
 			# replacing any non-bracketed links
 			$trail = $this->replaceFreeExternalLinks( $trail );
-
 
 			# Use the encoded URL
 			# This means that users can paste URLs directly into the text
 			# Funny characters like ö aren't valid in URLs anyway
 			# This was changed in August 2004
 			$s .= $sk->makeExternalLink( $url, $text, false, $linktype ) . $dtrail . $trail;
+
+			# Register link in the output object
+			$this->mOutput->addExternalLink( $url );
 		}
 
 		wfProfileOut( $fname );
@@ -1189,12 +1193,16 @@ class Parser
 				# All HTML entities will be escaped by makeExternalLink()
 				# or maybeMakeExternalImage()
 				$url = str_replace( '&amp;', '&', $url );
+				# Replace unnecessary URL escape codes with their equivalent characters
+				$url = Parser::replaceUnusualEscapes( $url );
 
 				# Is this an external image?
 				$text = $this->maybeMakeExternalImage( $url );
 				if ( $text === false ) {
 					# Not an image, make a link
 					$text = $sk->makeExternalLink( $url, $wgContLang->markNoConversion($url), true, 'free' );
+					# Register it in the output object
+					$this->mOutput->addExternalLink( $url );
 				}
 				$s .= $text . $trail;
 			} else {
@@ -1205,6 +1213,40 @@ class Parser
 		return $s;
 	}
 
+	/**
+	 * Replace unusual URL escape codes with their equivalent characters
+	 * @param string $url
+	 * @return string
+	 * @static
+	 */
+	function replaceUnusualEscapes( $url ) {
+		return preg_replace_callback( '/%[0-9A-Fa-f]{2}/',
+			array( 'Parser', 'replaceUnusualEscapesCallback' ), $url );
+	}
+
+	/**
+	 * Callback function used in replaceUnusualEscapes().
+	 * Replaces unusual URL escape codes with their equivalent character
+	 * @param array $matches Regex match; $matches[0] is the %XX escape code
+	 * @return string The decoded character, or the escape code unchanged
+	 * @static
+	 * @access private
+	 */
+	function replaceUnusualEscapesCallback( $matches ) {
+		$char = urldecode( $matches[0] );
+		$ord = ord( $char );
+		// Is it an unsafe or reserved character according to RFC 1738? "%" has
+		// to stay escaped too, otherwise %2541 would be rewritten to %41, and
+		// ":@=&+" carry meaning as delimiters (or as a space, for "+").
+		if ( $ord > 32 && $ord < 127 && strpos( '<>"#%{}|\^~[]`;/?:@=&+', $char ) === false ) {
+			// No, shouldn't be escaped
+			return $char;
+		} else {
+			// Yes, leave it escaped
+			return $matches[0];
+		}
+	}
+
 	/**
 	 * make an image if it's allowed, either through the global
 	 * option or through the exception
@@ -3742,7 +3784,8 @@ class ParserOutput
 		$mTitleText, # title text of the chosen language variant
 		$mLinks, # 2-D map of NS/DBK to ID for the links in the document. ID=zero for broken.
 		$mTemplates, # 2-D map of NS/DBK to ID for the template references. ID=zero for broken.
-		$mImages; # DB keys of the images used, in the array key only
+		$mImages, # DB keys of the images used, in the array key only
+		$mExternalLinks; # External link URLs, in the key only
 
 	function ParserOutput( $text = '', $languageLinks = array(), $categoryLinks = array(),
 		$containsOldMagic = false, $titletext = '' )
@@ -3757,6 +3800,7 @@ class ParserOutput
 		$this->mLinks = array();
 		$this->mTemplates = array();
 		$this->mImages = array();
+		$this->mExternalLinks = array();
 	}
 
 	function getText() { return $this->mText; }
@@ -3768,6 +3812,7 @@ class ParserOutput
 	function &getLinks() { return $this->mLinks; }
 	function &getTemplates() { return $this->mTemplates; }
 	function &getImages() { return $this->mImages; }
+	function &getExternalLinks() { return $this->mExternalLinks; }
 	function containsOldMagic() { return $this->mContainsOldMagic; }
 
 	function setText( $text ) { return wfSetVar( $this->mText, $text ); }
@@ -3780,6 +3825,7 @@ class ParserOutput
 	function addCategory( $c, $sort ) { $this->mCategories[$c] = $sort; }
 	function addImage( $name ) { $this->mImages[$name] = 1; }
 	function addLanguageLink( $t ) { $this->mLanguageLinks[] =
$t; }
+	function addExternalLink( $url ) { $this->mExternalLinks[$url] = 1; }
 
 	function addLink( $title, $id ) {
 		$ns = $title->getNamespace();
diff --git a/maintenance/archives/patch-externallinks.sql b/maintenance/archives/patch-externallinks.sql
new file mode 100644
index 0000000000..d1aa5764f0
--- /dev/null
+++ b/maintenance/archives/patch-externallinks.sql
@@ -0,0 +1,15 @@
+--
+-- Track links to external URLs
+-- The blob columns carry no DEFAULT clause: MySQL does not allow one for
+-- BLOB/TEXT columns and rejects it in strict mode.
+--
+CREATE TABLE /*$wgDBprefix*/externallinks (
+  el_from int(8) unsigned NOT NULL default '0',
+  el_to blob NOT NULL,
+  el_index blob NOT NULL,
+
+  KEY (el_from, el_to(40)),
+  KEY (el_to(60), el_from),
+  KEY (el_index(60))
+) TYPE=InnoDB;
+
diff --git a/maintenance/mysql5/tables.sql b/maintenance/mysql5/tables.sql
index d09a67157d..f08e39ffd8 100644
--- a/maintenance/mysql5/tables.sql
+++ b/maintenance/mysql5/tables.sql
@@ -462,6 +462,35 @@ CREATE TABLE /*$wgDBprefix*/categorylinks (
 
 ) TYPE=InnoDB, DEFAULT CHARSET=utf8;
 
+--
+-- Track links to external URLs
+--
+CREATE TABLE /*$wgDBprefix*/externallinks (
+  -- page_id of the referring page
+  el_from int(8) unsigned NOT NULL default '0',
+
+  -- The URL
+  -- (no DEFAULT clause here or on el_index: MySQL does not allow one for BLOB/TEXT columns)
+  el_to blob NOT NULL,
+
+  -- In the case of HTTP URLs, this is the URL with any username or password
+  -- removed, and with the labels in the hostname reversed and converted to
+  -- lower case. An extra dot is added to allow for matching of either
+  -- example.com or *.example.com in a single scan.
+  -- Example:
+  --      http://user:password@sub.example.com/page.html
+  --   becomes
+  --      http://com.example.sub./page.html
+  -- which allows for fast searching for all pages under example.com with the
+  -- clause:
+  --      WHERE el_index LIKE 'http://com.example.%'
+  el_index blob NOT NULL,
+
+  KEY (el_from, el_to(40)),
+  KEY (el_to(60), el_from),
+  KEY (el_index(60))
+) TYPE=InnoDB, DEFAULT CHARSET=utf8;
+
 --
 -- Contains a single row with some aggregate info
 -- on the state of the site.
diff --git a/maintenance/refreshLinks.inc b/maintenance/refreshLinks.inc
index 04125c12df..fa3c9e64f5 100644
--- a/maintenance/refreshLinks.inc
+++ b/maintenance/refreshLinks.inc
@@ -69,7 +69,7 @@ function refreshLinks( $start, $newOnly = false, $maxLag = false, $end = 0 ) {
 }
 
 function fixLinksFromArticle( $id ) {
-	global $wgTitle, $wgArticle, $wgOut, $wgParser, $wgLinkCache;
+	global $wgTitle, $wgArticle, $wgOut, $wgParser;
 
 	$wgTitle = Title::newFromID( $id );
 	$dbw =& wfGetDB( DB_MASTER );
@@ -105,6 +105,8 @@ function deleteLinksFromNonexistent( $maxLag = 0 ) {
 		'pagelinks' => 'pl_from',
 		'imagelinks' => 'il_from',
 		'categorylinks' => 'cl_from',
+		'templatelinks' => 'tl_from',
+		'externallinks' => 'el_from',
 	);
 
 	$page = $dbw->tableName( 'page' );
diff --git a/maintenance/tables.sql b/maintenance/tables.sql
index 8a08553d7a..2aa58c4713 100644
--- a/maintenance/tables.sql
+++ b/maintenance/tables.sql
@@ -449,6 +449,35 @@ CREATE TABLE /*$wgDBprefix*/categorylinks (
 
 ) TYPE=InnoDB;
 
+--
+-- Track links to external URLs
+--
+CREATE TABLE /*$wgDBprefix*/externallinks (
+  -- page_id of the referring page
+  el_from int(8) unsigned NOT NULL default '0',
+
+  -- The URL
+  -- (no DEFAULT clause here or on el_index: MySQL does not allow one for BLOB/TEXT columns)
+  el_to blob NOT NULL,
+
+  -- In the case of HTTP URLs, this is the URL with any username or password
+  -- removed, and with the labels in the hostname reversed and converted to
+  -- lower case. An extra dot is added to allow for matching of either
+  -- example.com or *.example.com in a single scan.
+  -- Example:
+  --      http://user:password@sub.example.com/page.html
+  --   becomes
+  --      http://com.example.sub./page.html
+  -- which allows for fast searching for all pages under example.com with the
+  -- clause:
+  --      WHERE el_index LIKE 'http://com.example.%'
+  el_index blob NOT NULL,
+
+  KEY (el_from, el_to(40)),
+  KEY (el_to(60), el_from),
+  KEY (el_index(60))
+) TYPE=InnoDB;
+
 --
 -- Contains a single row with some aggregate info
 -- on the state of the site.
diff --git a/maintenance/updaters.inc b/maintenance/updaters.inc
index aeec62f4e0..d53b429cec 100644
--- a/maintenance/updaters.inc
+++ b/maintenance/updaters.inc
@@ -26,6 +26,7 @@ $wgNewTables = array(
 	array( 'user_newtalk', 'patch-usernewtalk2.sql' ),
 	array( 'transcache', 'patch-transcache.sql' ),
 	array( 'trackbacks', 'patch-trackbacks.sql' ),
+	array( 'externallinks', 'patch-externallinks.sql' ),
 );
 
 $wgNewFields = array(