namespace are changed
* Respect database prefix in dumpHTML.inc
* Removed read-only check from Database::query()
+* Added externallinks table, to track links to arbitrary URLs
Documentation:
* (bug 3306) Document $wgLocalTZoffset
* Fix XML validity checks in parser tests on PHP 5.1
* (bug 4377) "[" is not valid in URLs
* (bug 4453) fix for __TOC__ dollar-number breakage
+* Convert unnecessary URL escape codes in external links to their equivalent
+ characters before doing anything with them. This prevents certain kinds of
+ spam filter evasion.
Upload:
* (bug 2527) Always set destination filename when new file is selected
}
}
+/**
+ * Make a URL index, appropriate for the el_index field of externallinks.
+ *
+ * The index is the URL with the labels of the hostname reversed and converted
+ * to lower case, and with a dot appended to the reversed hostname, e.g.
+ *   http://user:password@sub.example.com/page.html
+ * becomes
+ *   http://com.example.sub./page.html
+ * User name and password are left out; port, path, query and fragment are kept.
+ *
+ * @param string $url The URL to index
+ * @return mixed The index string, or false if the URL cannot be parsed, is not
+ *   an http URL, or has no host component
+ */
+function wfMakeUrlIndex( $url ) {
+	wfSuppressWarnings();
+	$bits = parse_url( $url );
+	wfRestoreWarnings();
+	// parse_url() leaves out the keys of components that are absent, so make
+	// sure scheme and host exist before reading them (e.g. "http:foo" has no host)
+	if ( !$bits || !isset( $bits['scheme'] ) || !isset( $bits['host'] ) ) {
+		return false;
+	}
+	if ( $bits['scheme'] !== 'http' ) {
+		return false;
+	}
+	// Drop the trailing dot of a fully-qualified hostname ("example.com."), which
+	// would otherwise become a leading dot after the reversal and keep the URL
+	// out of prefix searches on the reversed domain
+	$host = rtrim( $bits['host'], '.' );
+	// Reverse the labels in the hostname, convert to lower case
+	$reversedHost = strtolower( implode( '.', array_reverse( explode( '.', $host ) ) ) );
+	// Add an extra dot to the end
+	if ( substr( $reversedHost, -1 ) !== '.' ) {
+		$reversedHost .= '.';
+	}
+	// Reconstruct the pseudo-URL
+	$index = "http://$reversedHost";
+	// Leave out user and password. Add the port, path, query and fragment
+	if ( isset( $bits['port'] ) ) {
+		$index .= ':' . $bits['port'];
+	}
+	if ( isset( $bits['path'] ) ) {
+		$index .= $bits['path'];
+	} else {
+		// No path given: use the root path
+		$index .= '/';
+	}
+	if ( isset( $bits['query'] ) ) {
+		$index .= '?' . $bits['query'];
+	}
+	if ( isset( $bits['fragment'] ) ) {
+		$index .= '#' . $bits['fragment'];
+	}
+	return $index;
+}
+
?>
$mLinks, # Map of title strings to IDs for the links in the document
$mImages, # DB keys of the images used, in the array key only
$mTemplates, # Map of title strings to IDs for the template references, including broken ones
+ $mExternals, # URLs of external links, array key only
$mCategories, # Map of category names to sort keys
$mDb, # Database connection reference
$mOptions; # SELECT options to be used (array)
$this->mLinks =& $this->mParserOutput->getLinks();
$this->mImages =& $this->mParserOutput->getImages();
$this->mTemplates =& $this->mParserOutput->getTemplates();
+ $this->mExternals =& $this->mParserOutput->getExternalLinks();
$this->mCategories =& $this->mParserOutput->getCategories();
}
$this->incrTableUpdate( 'imagelinks', 'il', $this->getImageDeletions( $existing ),
$this->getImageInsertions( $existing ) );
+ # External links
+ $existing = $this->getExistingExternals();
+ $this->incrTableUpdate( 'externallinks', 'el', $this->getExternalDeletions( $existing ),
+ $this->getExternalInsertions( $existing ) );
+
# Category links
$existing = $this->getExistingCategories();
$this->incrTableUpdate( 'categorylinks', 'cl', $this->getCategoryDeletions( $existing ),
$this->dumbTableUpdate( 'imagelinks', $this->getImageInsertions(), 'il_from' );
$this->dumbTableUpdate( 'categorylinks', $this->getCategoryInsertions(), 'cl_from' );
$this->dumbTableUpdate( 'templatelinks', $this->getTemplateInsertions(), 'tl_from' );
+ $this->dumbTableUpdate( 'externallinks', $this->getExternalInsertions(), 'el_from' );
# Update the cache of all the category pages
$this->invalidateCategories( $categoryUpdates );
function getImageInsertions( $existing = array() ) {
$arr = array();
$diffs = array_diff_key( $this->mImages, $existing );
- foreach( $diffs as $iname => $val ) {
+ foreach( $diffs as $iname => $dummy ) {
$arr[] = array(
'il_from' => $this->mId,
'il_to' => $iname
return $arr;
}
+	/**
+	 * Build the rows to insert into externallinks: one row for every URL in
+	 * $this->mExternals that is not already a key of $existing.
+	 * @param array $existing Links to skip, URLs in the keys
+	 * @return array List of rows with the fields el_from, el_to and el_index
+	 * @access private
+	 */
+	function getExternalInsertions( $existing = array() ) {
+		$rows = array();
+		$newUrls = array_keys( array_diff_key( $this->mExternals, $existing ) );
+		foreach ( $newUrls as $url ) {
+			$row = array();
+			$row['el_from'] = $this->mId;
+			$row['el_to'] = $url;
+			$row['el_index'] = wfMakeUrlIndex( $url );
+			$rows[] = $row;
+		}
+		return $rows;
+	}
+
/**
* Get an array of category insertions
* @param array $existing Array mapping existing category names to sort keys. If both
return array_diff_key( $existing, $this->mImages );
}
+	/**
+	 * Work out which of the existing external links have to be deleted.
+	 * @param array $existing Existing external links, URLs in the keys
+	 * @return array The entries of $existing whose key is not a key of
+	 *   $this->mExternals
+	 * @access private
+	 */
+	function getExternalDeletions( $existing ) {
+		$stale = array_diff_key( $existing, $this->mExternals );
+		return $stale;
+	}
+
/**
* Given an array of existing categories, returns those categories which are not in $this
* and thus should be deleted.
}
$arr[$row->pl_namespace][$row->pl_title] = 1;
}
+ $this->mDb->freeResult( $res );
return $arr;
}
}
$arr[$row->tl_namespace][$row->tl_title] = 1;
}
+ $this->mDb->freeResult( $res );
return $arr;
}
while ( $row = $this->mDb->fetchObject( $res ) ) {
$arr[$row->il_to] = 1;
}
+ $this->mDb->freeResult( $res );
+ return $arr;
+ }
+
+ /**
+ * Get an array of existing external links, URLs in the keys.
+ * Selects el_to from the externallinks table for the rows whose el_from
+ * equals $this->mId.
+ * @return array Map of URL => 1
+ * @access private
+ */
+ function getExistingExternals() {
+ $fname = 'LinksUpdate::getExistingExternals';
+ $res = $this->mDb->select( 'externallinks', array( 'el_to' ),
+ array( 'el_from' => $this->mId ), $fname, $this->mOptions );
+ $arr = array();
+ // Only the keys carry information; the value 1 is a placeholder
+ while ( $row = $this->mDb->fetchObject( $res ) ) {
+ $arr[$row->el_to] = 1;
+ }
+ // Free the result set once every row has been read
+ $this->mDb->freeResult( $res );
return $arr;
}
while ( $row = $this->mDb->fetchObject( $res ) ) {
$arr[$row->cl_to] = $row->cl_sortkey;
}
+ $this->mDb->freeResult( $res );
return $arr;
}
}
# Replace &amp; from obsolete syntax with &.
# All HTML entities will be escaped by makeExternalLink()
- # or maybeMakeExternalImage()
$url = str_replace( '&', '&', $url );
+ # Replace unnecessary URL escape codes with the referenced character
+ # This prevents spammers from hiding links from the filters
+ $url = Parser::replaceUnusualEscapes( $url );
# Process the trail (i.e. everything after this link up until start of the next link),
# replacing any non-bracketed links
$trail = $this->replaceFreeExternalLinks( $trail );
-
# Use the encoded URL
# This means that users can paste URLs directly into the text
# Funny characters like ö aren't valid in URLs anyway
# This was changed in August 2004
$s .= $sk->makeExternalLink( $url, $text, false, $linktype ) . $dtrail . $trail;
+
+ # Register link in the output object
+ $this->mOutput->addExternalLink( $url );
}
wfProfileOut( $fname );
# All HTML entities will be escaped by makeExternalLink()
# or maybeMakeExternalImage()
$url = str_replace( '&', '&', $url );
+ # Replace unnecessary URL escape codes with their equivalent characters
+ $url = Parser::replaceUnusualEscapes( $url );
# Is this an external image?
$text = $this->maybeMakeExternalImage( $url );
if ( $text === false ) {
# Not an image, make a link
$text = $sk->makeExternalLink( $url, $wgContLang->markNoConversion($url), true, 'free' );
+ # Register it in the output object
+ $this->mOutput->addExternalLink( $url );
}
$s .= $text . $trail;
} else {
return $s;
}
+	/**
+	 * Turn needlessly escaped characters in a URL back into the characters
+	 * themselves. Every %XX sequence in the URL is passed to
+	 * replaceUnusualEscapesCallback(), which decides whether to decode it.
+	 * @param string $url
+	 * @return string
+	 * @static
+	 */
+	function replaceUnusualEscapes( $url ) {
+		$escapePattern = '/%[0-9A-Fa-f]{2}/';
+		$handler = array( 'Parser', 'replaceUnusualEscapesCallback' );
+		return preg_replace_callback( $escapePattern, $handler, $url );
+	}
+
+	/**
+	 * Callback function used in replaceUnusualEscapes().
+	 * Decodes a single %XX escape code if the character it stands for may be
+	 * written literally, and leaves the escape code untouched otherwise.
+	 * @param array $matches Match array from preg_replace_callback(); element 0
+	 *   is the %XX sequence
+	 * @return string The decoded character, or the unchanged escape code
+	 * @static
+	 * @access private
+	 */
+	function replaceUnusualEscapesCallback( $matches ) {
+		$char = urldecode( $matches[0] );
+		$ord = ord( $char );
+		// Characters that have to stay escaped:
+		//  - the unsafe characters of RFC 1738, including '%' itself, so that
+		//    %25 never turns into a bare percent sign that starts a new escape
+		//  - the reserved characters ; / ? : @ = & which delimit URL components
+		//  - '+', which stands for a space in form-encoded query strings
+		$keepEscaped = '<>"#%{}|\^~[]`;/?:@=&+';
+		// Only printable ASCII (codes 33 to 126) outside that set is decoded;
+		// space, control characters, DEL and bytes above 127 stay escaped
+		if ( $ord > 32 && $ord < 127 && strpos( $keepEscaped, $char ) === false ) {
+			// Shouldn't be escaped
+			return $char;
+		}
+		// Leave it escaped
+		return $matches[0];
+	}
+
/**
* make an image if it's allowed, either through the global
* option or through the exception
$mTitleText, # title text of the chosen language variant
$mLinks, # 2-D map of NS/DBK to ID for the links in the document. ID=zero for broken.
$mTemplates, # 2-D map of NS/DBK to ID for the template references. ID=zero for broken.
- $mImages; # DB keys of the images used, in the array key only
+ $mImages, # DB keys of the images used, in the array key only
+ $mExternalLinks; # External link URLs, in the key only
function ParserOutput( $text = '', $languageLinks = array(), $categoryLinks = array(),
$containsOldMagic = false, $titletext = '' )
$this->mLinks = array();
$this->mTemplates = array();
$this->mImages = array();
+ $this->mExternalLinks = array();
}
function getText() { return $this->mText; }
function &getLinks() { return $this->mLinks; }
function &getTemplates() { return $this->mTemplates; }
function &getImages() { return $this->mImages; }
+ # Returns, by reference, the array of external links (URLs in the keys only)
+ function &getExternalLinks() { return $this->mExternalLinks; }
function containsOldMagic() { return $this->mContainsOldMagic; }
function setText( $text ) { return wfSetVar( $this->mText, $text ); }
function addCategory( $c, $sort ) { $this->mCategories[$c] = $sort; }
function addImage( $name ) { $this->mImages[$name] = 1; }
function addLanguageLink( $t ) { $this->mLanguageLinks[] = $t; }
+ # Registers $url as an external link: the URL becomes an array key with the value 1
+ function addExternalLink( $url ) { $this->mExternalLinks[$url] = 1; }
function addLink( $title, $id ) {
$ns = $title->getNamespace();
--- /dev/null
+--
+-- Track links to external URLs
+--
+CREATE TABLE /*$wgDBprefix*/externallinks (
+ -- page_id of the referring page
+ el_from int(8) unsigned NOT NULL default '0',
+ -- The URL
+ el_to blob NOT NULL default '',
+ -- In the case of HTTP URLs, this is the URL with any username or password
+ -- removed, and with the labels in the hostname reversed and converted to
+ -- lower case, followed by an extra dot. Example:
+ -- http://user:password@sub.example.com/page.html
+ -- becomes
+ -- http://com.example.sub./page.html
+ -- which allows for fast searching for all pages under example.com with the
+ -- clause:
+ -- WHERE el_index LIKE 'http://com.example.%'
+ el_index blob NOT NULL default '',
+
+ -- The keys cover only a prefix of each blob column (the length in parentheses)
+ KEY (el_from, el_to(40)),
+ KEY (el_to(60), el_from),
+ KEY (el_index(60))
+) TYPE=InnoDB;
+
) TYPE=InnoDB, DEFAULT CHARSET=utf8;
+--
+-- Track links to external URLs
+--
+CREATE TABLE /*$wgDBprefix*/externallinks (
+ -- page_id of the referring page
+ el_from int(8) unsigned NOT NULL default '0',
+
+ -- The URL
+ el_to blob NOT NULL default '',
+
+ -- In the case of HTTP URLs, this is the URL with any username or password
+ -- removed, and with the labels in the hostname reversed and converted to
+ -- lower case. An extra dot is added to the end of the reversed hostname to
+ -- allow for matching of either example.com or *.example.com in a single scan.
+ -- The port, path, query and fragment of the URL are kept.
+ -- Example:
+ -- http://user:password@sub.example.com/page.html
+ -- becomes
+ -- http://com.example.sub./page.html
+ -- which allows for fast searching for all pages under example.com with the
+ -- clause:
+ -- WHERE el_index LIKE 'http://com.example.%'
+ el_index blob NOT NULL default '',
+
+ -- The keys cover only a prefix of each blob column (the length in parentheses)
+ KEY (el_from, el_to(40)),
+ KEY (el_to(60), el_from),
+ KEY (el_index(60))
+) TYPE=InnoDB, DEFAULT CHARSET=utf8;
+
--
-- Contains a single row with some aggregate info
-- on the state of the site.
}
function fixLinksFromArticle( $id ) {
- global $wgTitle, $wgArticle, $wgOut, $wgParser, $wgLinkCache;
+ global $wgTitle, $wgArticle, $wgOut, $wgParser;
$wgTitle = Title::newFromID( $id );
$dbw =& wfGetDB( DB_MASTER );
'pagelinks' => 'pl_from',
'imagelinks' => 'il_from',
'categorylinks' => 'cl_from',
+ 'templatelinks' => 'tl_from',
+ 'externallinks' => 'el_from',
);
$page = $dbw->tableName( 'page' );
) TYPE=InnoDB;
+--
+-- Track links to external URLs
+--
+CREATE TABLE /*$wgDBprefix*/externallinks (
+ -- page_id of the referring page
+ el_from int(8) unsigned NOT NULL default '0',
+
+ -- The URL
+ el_to blob NOT NULL default '',
+
+ -- In the case of HTTP URLs, this is the URL with any username or password
+ -- removed, and with the labels in the hostname reversed and converted to
+ -- lower case. An extra dot is added to the end of the reversed hostname to
+ -- allow for matching of either example.com or *.example.com in a single scan.
+ -- The port, path, query and fragment of the URL are kept.
+ -- Example:
+ -- http://user:password@sub.example.com/page.html
+ -- becomes
+ -- http://com.example.sub./page.html
+ -- which allows for fast searching for all pages under example.com with the
+ -- clause:
+ -- WHERE el_index LIKE 'http://com.example.%'
+ el_index blob NOT NULL default '',
+
+ -- The keys cover only a prefix of each blob column (the length in parentheses)
+ KEY (el_from, el_to(40)),
+ KEY (el_to(60), el_from),
+ KEY (el_index(60))
+) TYPE=InnoDB;
+
--
-- Contains a single row with some aggregate info
-- on the state of the site.
array( 'user_newtalk', 'patch-usernewtalk2.sql' ),
array( 'transcache', 'patch-transcache.sql' ),
array( 'trackbacks', 'patch-trackbacks.sql' ),
+ array( 'externallinks', 'patch-externallinks.sql' ),
);
$wgNewFields = array(