the current parse language where available.
==== Changed configuration ====
+* Some external link searches will not work correctly until update.php (or
+ refreshExternallinksIndex.php) is run. These include searches for links using
+ IP addresses, internationalized domain names, and possibly mailto links.
* …
==== Removed configuration ====
'RedisConnectionPool' => __DIR__ . '/includes/libs/redis/RedisConnectionPool.php',
'RedisLockManager' => __DIR__ . '/includes/libs/lockmanager/RedisLockManager.php',
'RedisPubSubFeedEngine' => __DIR__ . '/includes/rcfeed/RedisPubSubFeedEngine.php',
+ 'RefreshExternallinksIndex' => __DIR__ . '/maintenance/refreshExternallinksIndex.php',
'RefreshFileHeaders' => __DIR__ . '/maintenance/refreshFileHeaders.php',
'RefreshImageMetadata' => __DIR__ . '/maintenance/refreshImageMetadata.php',
'RefreshLinks' => __DIR__ . '/maintenance/refreshLinks.php',
/**
* Make URL indexes, appropriate for the el_index field of externallinks.
*
+ * @deprecated since 1.33, use LinkFilter::makeIndexes() instead
* @param string $url
* @return array
*/
function wfMakeUrlIndexes( $url ) {
- $bits = wfParseUrl( $url );
-
- // Reverse the labels in the hostname, convert to lower case
- // For emails reverse domainpart only
- if ( $bits['scheme'] == 'mailto' ) {
- $mailparts = explode( '@', $bits['host'], 2 );
- if ( count( $mailparts ) === 2 ) {
- $domainpart = strtolower( implode( '.', array_reverse( explode( '.', $mailparts[1] ) ) ) );
- } else {
- // No domain specified, don't mangle it
- $domainpart = '';
- }
- $reversedHost = $domainpart . '@' . $mailparts[0];
- } else {
- $reversedHost = strtolower( implode( '.', array_reverse( explode( '.', $bits['host'] ) ) ) );
- }
- // Add an extra dot to the end
- // Why? Is it in wrong place in mailto links?
- if ( substr( $reversedHost, -1, 1 ) !== '.' ) {
- $reversedHost .= '.';
- }
- // Reconstruct the pseudo-URL
- $prot = $bits['scheme'];
- $index = $prot . $bits['delimiter'] . $reversedHost;
- // Leave out user and password. Add the port, path, query and fragment
- if ( isset( $bits['port'] ) ) {
- $index .= ':' . $bits['port'];
- }
- if ( isset( $bits['path'] ) ) {
- $index .= $bits['path'];
- } else {
- $index .= '/';
- }
- if ( isset( $bits['query'] ) ) {
- $index .= '?' . $bits['query'];
- }
- if ( isset( $bits['fragment'] ) ) {
- $index .= '#' . $bits['fragment'];
- }
-
- if ( $prot == '' ) {
- return [ "http:$index", "https:$index" ];
- } else {
- return [ $index ];
- }
+ wfDeprecated( __FUNCTION__, '1.33' );
+ return LinkFilter::makeIndexes( $url );
}
/**
* Another cool thing to do would be a web interface for fast spam removal.
*/
class LinkFilter {
+ /**
+ * Increment this when makeIndexes output changes. It'll cause
+ * maintenance/refreshExternallinksIndex.php to run from update.php.
+ */
+ const VERSION = 1;
/**
* Check whether $content contains a link to $filterEntry
/**
* Builds a regex pattern for $filterEntry.
*
+ * @todo This doesn't match the rest of the functionality here.
* @param string $filterEntry URL, if it begins with "*.", it'll be
* replaced to match any subdomain
* @param string $protocol 'http://' or 'https://'
}
/**
- * Make an array to be used for calls to Database::buildLike(), which
- * will match the specified string. There are several kinds of filter entry:
- * *.domain.com - Produces http://com.domain.%, matches domain.com
- * and www.domain.com
- * domain.com - Produces http://com.domain./%, matches domain.com
- * or domain.com/ but not www.domain.com
- * *.domain.com/x - Produces http://com.domain.%/x%, matches
- * www.domain.com/xy
- * domain.com/x - Produces http://com.domain./x%, matches
- * domain.com/xy but not www.domain.com/xy
+ * Indicate whether LinkFilter IDN support is available
+ * @since 1.33
+ * @return bool
+ */
+ public static function supportsIDN() {
+ 	// The UTS #46 variant constant must be defined...
+ 	if ( !defined( 'INTL_IDNA_VARIANT_UTS46' ) ) {
+ 		return false;
+ 	}
+ 	// ...and the conversion function itself must be callable.
+ 	return is_callable( 'idn_to_utf8' );
+ }
+
+ /**
+ * Canonicalize a hostname for el_index
+ * @param string $host
+ * @return string
+ */
+ private static function indexifyHost( $host ) {
+ // NOTE: If you change the output of this method, you'll probably have to increment self::VERSION!
+
+ // Canonicalize.
+ // Start from the raw bytes: undo any percent-encoding before converting/re-encoding.
+ $host = rawurldecode( $host );
+ if ( $host !== '' && self::supportsIDN() ) {
+ // @todo Add a PHP fallback
+ // Keep the host as-is when idn_to_utf8() reports failure (false).
+ $tmp = idn_to_utf8( $host, IDNA_DEFAULT, INTL_IDNA_VARIANT_UTS46 );
+ if ( $tmp !== false ) {
+ $host = $tmp;
+ }
+ }
+ // Characters that are left unencoded (used inside a regex character class below).
+ $okChars = 'a-zA-Z0-9\\-._~!$&\'()*+,;=';
+ if ( StringUtils::isUtf8( $host ) ) {
+ // Save a little space by not percent-encoding valid UTF-8 bytes
+ $okChars .= '\x80-\xf4';
+ }
+ // Lower-case the host, then percent-encode every byte that is not in $okChars.
+ $host = preg_replace_callback(
+ '<[^' . $okChars . ']>',
+ function ( $m ) {
+ return rawurlencode( $m[0] );
+ },
+ strtolower( $host )
+ );
+
+ // IPv6? RFC 3986 syntax.
+ // '[' and ']' are not in $okChars, so they were percent-encoded above; decode again
+ // for matching. '*' is accepted so wildcard filter entries can be recognized.
+ if ( preg_match( '/^\[([0-9a-f:*]+)\]$/', rawurldecode( $host ), $m ) ) {
+ $ip = $m[1];
+ if ( IP::isValid( $ip ) ) {
+ // Complete address: 'V6.' + sanitized address with ':' replaced by '.' + trailing '.'
+ return 'V6.' . implode( '.', explode( ':', IP::sanitizeIP( $ip ) ) ) . '.';
+ }
+ if ( substr( $ip, -2 ) === ':*' ) {
+ // Strip the trailing ":*" and test what remains as an address prefix.
+ $cutIp = substr( $ip, 0, -2 );
+ if ( IP::isValid( "{$cutIp}::" ) ) {
+ // Wildcard IP doesn't contain "::", so multiple parts can be wild
+ // $ct is the number of groups written before the '*'; only those are kept.
+ $ct = count( explode( ':', $ip ) ) - 1;
+ return 'V6.' .
+ implode( '.', array_slice( explode( ':', IP::sanitizeIP( "{$cutIp}::" ) ), 0, $ct ) ) .
+ '.*.';
+ }
+ if ( IP::isValid( "{$cutIp}:1" ) ) {
+ // Wildcard IP does contain "::", so only the last part is wild
+ // A placeholder final group "1" makes the address valid; substr( ..., 0, -1 )
+ // then removes the last character of the joined string before '*.' is appended.
+ // NOTE(review): assumes IP::sanitizeIP() leaves that final group as the single
+ // character "1" - confirm against IP::sanitizeIP().
+ return 'V6.' .
+ substr( implode( '.', explode( ':', IP::sanitizeIP( "{$cutIp}:1" ) ) ), 0, -1 ) .
+ '*.';
+ }
+ }
+ }
+
+ // Regularize explicit specification of the DNS root.
+ // Browsers seem to do this for IPv4 literals too.
+ if ( substr( $host, -1 ) === '.' ) {
+ $host = substr( $host, 0, -1 );
+ }
+
+ // IPv4?
+ // $b matches one decimal octet in the range 0-255, allowing leading zeros.
+ $b = '(?:0*25[0-5]|0*2[0-4][0-9]|0*1[0-9][0-9]|0*[0-9]?[0-9])';
+ // Either four octets, or one to three octets followed by a '*' wildcard.
+ if ( preg_match( "/^(?:{$b}\.){3}{$b}$|^(?:{$b}\.){1,3}\*$/", $host ) ) {
+ // The (int) cast drops leading zeros from each octet; '*' is passed through.
+ return 'V4.' . implode( '.', array_map( function ( $v ) {
+ return $v === '*' ? $v : (int)$v;
+ }, explode( '.', $host ) ) ) . '.';
+ }
+
+ // Must be a host name.
+ // Reverse the dot-separated labels and terminate with '.'
+ return implode( '.', array_reverse( explode( '.', $host ) ) ) . '.';
+ }
+
+ /**
+ * Converts a URL into a format for el_index
+ * @since 1.33
+ * @param string $url
+ * @return string[] Usually one entry, but might be two in case of
+ * protocol-relative URLs. Empty array on error.
+ */
+ public static function makeIndexes( $url ) {
+ 	// NOTE: If you change the output of this method, you'll probably have to increment self::VERSION!
+
+ 	// NOTE: refreshExternallinksIndex.php assumes that only protocol-relative URLs return more
+ 	// than one index, and that the indexes for protocol-relative URLs only vary in the "http://"
+ 	// versus "https://" prefix. If you change that, you'll likely need to update
+ 	// refreshExternallinksIndex.php accordingly.
+
+ 	$parsed = wfParseUrl( $url );
+ 	if ( !$parsed ) {
+ 		// Unparseable URL: no index entries
+ 		return [];
+ 	}
+ 	$scheme = $parsed['scheme'];
+
+ 	// Hosts get their labels reversed and lower-cased (IPs are handled specially by
+ 	// indexifyHost()). Email addresses become "domain.reversed@localpart".
+ 	if ( $scheme == 'mailto' ) {
+ 		$pieces = explode( '@', $parsed['host'], 2 );
+ 		// Without an "@", treat the whole thing as a local part with no domain
+ 		$domain = count( $pieces ) === 2 ? self::indexifyHost( $pieces[1] ) : '';
+ 		$host = $domain . '@' . $pieces[0];
+ 	} else {
+ 		$host = self::indexifyHost( $parsed['host'] );
+ 	}
+
+ 	// Rebuild the pseudo-URL. User and password are deliberately left out;
+ 	// port, path, query and fragment are appended when present.
+ 	$index = $scheme . $parsed['delimiter'] . $host;
+ 	if ( isset( $parsed['port'] ) ) {
+ 		$index .= ':' . $parsed['port'];
+ 	}
+ 	$index .= $parsed['path'] ?? '/';
+ 	if ( isset( $parsed['query'] ) ) {
+ 		$index .= '?' . $parsed['query'];
+ 	}
+ 	if ( isset( $parsed['fragment'] ) ) {
+ 		$index .= '#' . $parsed['fragment'];
+ 	}
+
+ 	// Protocol-relative URLs are indexed under both http and https
+ 	return $scheme == '' ? [ "http:$index", "https:$index" ] : [ $index ];
+ }
+
+ /**
+ * Return query conditions which will match the specified string. There are
+ * several kinds of filter entry:
+ *
+ * *.domain.com - Matches domain.com and www.domain.com
+ * domain.com - Matches domain.com or domain.com/ but not www.domain.com
+ * *.domain.com/x - Matches domain.com/xy or www.domain.com/xy. Also probably matches
+ * domain.com/foobar/xy due to limitations of LIKE syntax.
+ * domain.com/x - Matches domain.com/xy but not www.domain.com/xy
+ * 192.0.2.* - Matches any IP in 192.0.2.0/24. Can also have a path appended.
+ * [2001:db8::*] - Matches any IP in 2001:db8::/112. Can also have a path appended.
+ * [2001:db8:*] - Matches any IP in 2001:db8::/32. Can also have a path appended.
+ * foo@domain.com - With protocol 'mailto:', matches the email address foo@domain.com.
+ * *@domain.com - With protocol 'mailto:', matches any email address at domain.com, but
+ * not subdomains like foo@mail.domain.com
*
* Asterisks in any other location are considered invalid.
*
- * This function does the same as wfMakeUrlIndexes(), except it also takes care
+ * @since 1.33
+ * @param string $filterEntry Filter entry, as described above
+ * @param array $options Options are:
+ * - protocol: (string) Protocol to query (default http://)
+ * - oneWildcard: (bool) Stop at the first wildcard (default false)
+ * - prefix: (string) Field prefix (default 'el'). The query will test
+ * fields '{$prefix}_index' and '{$prefix}_index_60'
+ * - db: (IDatabase|null) Database to use.
+ * @return array|bool Conditions to be used for the query (to be ANDed) or
+ * false on error. To determine if the query is constant on the
+ * el_index_60 field, check whether key 'el_index_60' is set.
+ */
+ public static function getQueryConditions( $filterEntry, array $options = [] ) {
+ 	// Fill in defaults for any option the caller did not supply
+ 	$options += [
+ 		'protocol' => 'http://',
+ 		'oneWildcard' => false,
+ 		'prefix' => 'el',
+ 		'db' => null,
+ 	];
+
+ 	// Build the full LIKE pattern; bail out on an invalid filter entry
+ 	$pattern = self::makeLikeArray( $filterEntry, $options['protocol'] );
+ 	if ( $pattern === false ) {
+ 		return false;
+ 	}
+
+ 	// Everything up to (and including) the first wildcard
+ 	$leading = self::keepOneWildcard( $pattern );
+ 	if ( $options['oneWildcard'] ) {
+ 		$pattern = $leading;
+ 	}
+ 	// Drop a trailing wildcard token so that only the constant prefix remains
+ 	$lastPos = count( $leading ) - 1;
+ 	if ( $leading[$lastPos] instanceof LikeMatch ) {
+ 		array_pop( $leading );
+ 	}
+ 	$constant = implode( '', $leading );
+
+ 	$fieldPrefix = $options['prefix'];
+ 	$dbr = $options['db'] ?: wfGetDB( DB_REPLICA );
+
+ 	if ( strlen( $constant ) >= 60 ) {
+ 		// The constant prefix covers the whole of *_index_60, so an equality
+ 		// comparison on that field is possible.
+ 		$conds = [ "{$fieldPrefix}_index_60" => substr( $constant, 0, 60 ) ];
+ 	} else {
+ 		// The constant prefix is shorter than *_index_60: do a prefix search.
+ 		$conds = [
+ 			"{$fieldPrefix}_index_60" . $dbr->buildLike( [ $constant, $dbr->anyString() ] ),
+ 		];
+ 	}
+ 	// In both cases the full pattern is tested against the untruncated field
+ 	$conds[] = "{$fieldPrefix}_index" . $dbr->buildLike( $pattern );
+
+ 	return $conds;
+ }
+
+ /**
+ * Make an array to be used for calls to Database::buildLike(), which
+ * will match the specified string.
+ *
+ * This function does the same as LinkFilter::makeIndexes(), except it also takes care
* of adding wildcards
*
- * @param string $filterEntry Domainparts
+ * @note You probably want self::getQueryConditions() instead
+ * @param string $filterEntry Filter entry, @see self::getQueryConditions()
* @param string $protocol Protocol (default http://)
* @return array|bool Array to be passed to Database::buildLike() or false on error
*/
$target = $protocol . $filterEntry;
$bits = wfParseUrl( $target );
-
- if ( $bits == false ) {
- // Unknown protocol?
+ if ( !$bits ) {
return false;
}
- if ( substr( $bits['host'], 0, 2 ) == '*.' ) {
- $subdomains = true;
- $bits['host'] = substr( $bits['host'], 2 );
- if ( $bits['host'] == '' ) {
- // We don't want to make a clause that will match everything,
- // that could be dangerous
- return false;
- }
- } else {
- $subdomains = false;
- }
-
- // Reverse the labels in the hostname, convert to lower case
- // For emails reverse domainpart only
+ $subdomains = false;
if ( $bits['scheme'] === 'mailto' && strpos( $bits['host'], '@' ) ) {
- // complete email address
- $mailparts = explode( '@', $bits['host'] );
- $domainpart = strtolower( implode( '.', array_reverse( explode( '.', $mailparts[1] ) ) ) );
- $bits['host'] = $domainpart . '@' . $mailparts[0];
- } elseif ( $bits['scheme'] === 'mailto' ) {
- // domainpart of email address only, do not add '.'
- $bits['host'] = strtolower( implode( '.', array_reverse( explode( '.', $bits['host'] ) ) ) );
+ // Email address with domain and non-empty local part
+ $mailparts = explode( '@', $bits['host'], 2 );
+ $domainpart = self::indexifyHost( $mailparts[1] );
+ if ( $mailparts[0] === '*' ) {
+ $subdomains = true;
+ $bits['host'] = $domainpart . '@';
+ } else {
+ $bits['host'] = $domainpart . '@' . $mailparts[0];
+ }
} else {
- $bits['host'] = strtolower( implode( '.', array_reverse( explode( '.', $bits['host'] ) ) ) );
- if ( substr( $bits['host'], -1, 1 ) !== '.' ) {
- $bits['host'] .= '.';
+ // Non-email, or email with only a domain part.
+ $bits['host'] = self::indexifyHost( $bits['host'] );
+ if ( substr( $bits['host'], -3 ) === '.*.' ) {
+ $subdomains = true;
+ $bits['host'] = substr( $bits['host'], 0, -2 );
}
}
* Filters an array returned by makeLikeArray(), removing everything past first
* pattern placeholder.
*
+ * @note You probably want self::getQueryConditions() instead
* @param array $arr Array to filter
* @return array Filtered array
*/
}
/**
+ * @deprecated since 1.33, use LinkFilter::getQueryConditions() instead
* @param string|null $query
* @param string|null $protocol
* @return null|string
*/
public function prepareUrlQuerySearchString( $query = null, $protocol = null ) {
+ wfDeprecated( __METHOD__, '1.33' );
$db = $this->getDB();
- if ( !is_null( $query ) || $query != '' ) {
+ if ( $query !== null && $query !== '' ) {
if ( is_null( $protocol ) ) {
$protocol = 'http://';
}
*/
private function run( $resultPageSet = null ) {
$params = $this->extractRequestParams();
+ $db = $this->getDB();
$query = $params['query'];
$protocol = self::getProtocolPrefix( $params['protocol'] );
- $this->addTables( [ 'page', 'externallinks' ] ); // must be in this order for 'USE INDEX'
- $this->addOption( 'USE INDEX', 'el_index' );
+ $this->addTables( [ 'page', 'externallinks' ] );
$this->addWhere( 'page_id=el_from' );
$miser_ns = [];
$this->addWhereFld( 'page_namespace', $params['namespace'] );
}
- // Normalize query to match the normalization applied for the externallinks table
- $query = Parser::normalizeLinkUrl( $query );
+ $orderBy = [];
- $whereQuery = $this->prepareUrlQuerySearchString( $query, $protocol );
+ if ( $query !== null && $query !== '' ) {
+ if ( $protocol === null ) {
+ $protocol = 'http://';
+ }
+
+ // Normalize query to match the normalization applied for the externallinks table
+ $query = Parser::normalizeLinkUrl( $protocol . $query );
+
+ $conds = LinkFilter::getQueryConditions( $query, [
+ 'protocol' => '',
+ 'oneWildcard' => true,
+ 'db' => $db
+ ] );
+ if ( !$conds ) {
+ $this->dieWithError( 'apierror-badquery' );
+ }
+ $this->addWhere( $conds );
+ if ( !isset( $conds['el_index_60'] ) ) {
+ $orderBy[] = 'el_index_60';
+ }
+ } else {
+ $orderBy[] = 'el_index_60';
- if ( $whereQuery !== null ) {
- $this->addWhere( $whereQuery );
+ if ( $protocol !== null ) {
+ $this->addWhere( 'el_index_60' . $db->buildLike( "$protocol", $db->anyString() ) );
+ } else {
+ // We're querying all protocols, filter out duplicate protocol-relative links
+ $this->addWhere( $db->makeList( [
+ 'el_to NOT' . $db->buildLike( '//', $db->anyString() ),
+ 'el_index_60 ' . $db->buildLike( 'http://', $db->anyString() ),
+ ], LIST_OR ) );
+ }
}
+ $orderBy[] = 'el_id';
+ $this->addOption( 'ORDER BY', $orderBy );
+ $this->addFields( $orderBy ); // Make sure
+
$prop = array_flip( $params['prop'] );
$fld_ids = isset( $prop['ids'] );
$fld_title = isset( $prop['title'] );
}
$limit = $params['limit'];
- $offset = $params['offset'];
$this->addOption( 'LIMIT', $limit + 1 );
- if ( isset( $offset ) ) {
- $this->addOption( 'OFFSET', $offset );
+
+ if ( $params['continue'] !== null ) {
+ $cont = explode( '|', $params['continue'] );
+ $this->dieContinueUsageIf( count( $cont ) !== count( $orderBy ) );
+ $i = count( $cont ) - 1;
+ $cond = $orderBy[$i] . ' >= ' . $db->addQuotes( rawurldecode( $cont[$i] ) );
+ while ( $i-- > 0 ) {
+ $field = $orderBy[$i];
+ $v = $db->addQuotes( rawurldecode( $cont[$i] ) );
+ $cond = "($field > $v OR ($field = $v AND $cond))";
+ }
+ $this->addWhere( $cond );
}
$res = $this->select( __METHOD__ );
if ( ++$count > $limit ) {
// We've reached the one extra which shows that there are
// additional pages to be had. Stop here...
- $this->setContinueEnumParameter( 'offset', $offset + $limit );
+ $this->setContinue( $orderBy, $row );
break;
}
}
$fit = $result->addValue( [ 'query', $this->getModuleName() ], null, $vals );
if ( !$fit ) {
- $this->setContinueEnumParameter( 'offset', $offset + $count - 1 );
+ $this->setContinue( $orderBy, $row );
break;
}
} else {
}
}
+ private function setContinue( $orderBy, $row ) {
+ 	// Escape '%' and '|' in each ORDER BY field value so the values can be
+ 	// joined with '|' and split apart again when the request is continued.
+ 	$encoded = array_map(
+ 		function ( $column ) use ( $row ) {
+ 			return strtr( $row->$column, [ '%' => '%25', '|' => '%7C' ] );
+ 		},
+ 		$orderBy
+ 	);
+ 	$this->setContinueEnumParameter( 'continue', implode( '|', $encoded ) );
+ }
+
public function getAllowedParams() {
$ret = [
'prop' => [
],
ApiBase::PARAM_HELP_MSG_PER_VALUE => [],
],
- 'offset' => [
- ApiBase::PARAM_TYPE => 'integer',
+ 'continue' => [
ApiBase::PARAM_HELP_MSG => 'api-help-param-continue',
],
'protocol' => [
}
$params = $this->extractRequestParams();
+ $db = $this->getDB();
$query = $params['query'];
$protocol = ApiQueryExtLinksUsage::getProtocolPrefix( $params['protocol'] );
$this->addTables( 'externallinks' );
$this->addWhereFld( 'el_from', array_keys( $this->getPageSet()->getGoodTitles() ) );
- $whereQuery = $this->prepareUrlQuerySearchString( $query, $protocol );
-
- if ( $whereQuery !== null ) {
- $this->addWhere( $whereQuery );
- }
+ $orderBy = [];
// Don't order by el_from if it's constant in the WHERE clause
if ( count( $this->getPageSet()->getGoodTitles() ) != 1 ) {
- $this->addOption( 'ORDER BY', 'el_from' );
+ $orderBy[] = 'el_from';
}
- // If we're querying all protocols, use DISTINCT to avoid repeating protocol-relative links twice
- if ( $protocol === null ) {
- $this->addOption( 'DISTINCT' );
+ if ( $query !== null && $query !== '' ) {
+ if ( $protocol === null ) {
+ $protocol = 'http://';
+ }
+
+ // Normalize query to match the normalization applied for the externallinks table
+ $query = Parser::normalizeLinkUrl( $protocol . $query );
+
+ $conds = LinkFilter::getQueryConditions( $query, [
+ 'protocol' => '',
+ 'oneWildcard' => true,
+ 'db' => $db
+ ] );
+ if ( !$conds ) {
+ $this->dieWithError( 'apierror-badquery' );
+ }
+ $this->addWhere( $conds );
+ if ( !isset( $conds['el_index_60'] ) ) {
+ $orderBy[] = 'el_index_60';
+ }
+ } else {
+ $orderBy[] = 'el_index_60';
+
+ if ( $protocol !== null ) {
+ $this->addWhere( 'el_index_60' . $db->buildLike( "$protocol", $db->anyString() ) );
+ } else {
+ // We're querying all protocols, filter out duplicate protocol-relative links
+ $this->addWhere( $db->makeList( [
+ 'el_to NOT' . $db->buildLike( '//', $db->anyString() ),
+ 'el_index_60 ' . $db->buildLike( 'http://', $db->anyString() ),
+ ], LIST_OR ) );
+ }
}
+ $orderBy[] = 'el_id';
+ $this->addOption( 'ORDER BY', $orderBy );
+ $this->addFields( $orderBy ); // Make sure
+
$this->addOption( 'LIMIT', $params['limit'] + 1 );
- $offset = $params['offset'] ?? 0;
- if ( $offset ) {
- $this->addOption( 'OFFSET', $params['offset'] );
+
+ if ( $params['continue'] !== null ) {
+ $cont = explode( '|', $params['continue'] );
+ $this->dieContinueUsageIf( count( $cont ) !== count( $orderBy ) );
+ $i = count( $cont ) - 1;
+ $cond = $orderBy[$i] . ' >= ' . $db->addQuotes( rawurldecode( $cont[$i] ) );
+ while ( $i-- > 0 ) {
+ $field = $orderBy[$i];
+ $v = $db->addQuotes( rawurldecode( $cont[$i] ) );
+ $cond = "($field > $v OR ($field = $v AND $cond))";
+ }
+ $this->addWhere( $cond );
}
$res = $this->select( __METHOD__ );
if ( ++$count > $params['limit'] ) {
// We've reached the one extra which shows that
// there are additional pages to be had. Stop here...
- $this->setContinueEnumParameter( 'offset', $offset + $params['limit'] );
+ $this->setContinue( $orderBy, $row );
break;
}
$entry = [];
ApiResult::setContentValue( $entry, 'url', $to );
$fit = $this->addPageSubItem( $row->el_from, $entry );
if ( !$fit ) {
- $this->setContinueEnumParameter( 'offset', $offset + $count - 1 );
+ $this->setContinue( $orderBy, $row );
break;
}
}
}
+ private function setContinue( $orderBy, $row ) {
+ 	// Escape '%' and '|' in each ORDER BY field value so the values can be
+ 	// joined with '|' and split apart again when the request is continued.
+ 	$encoded = array_map(
+ 		function ( $column ) use ( $row ) {
+ 			return strtr( $row->$column, [ '%' => '%25', '|' => '%7C' ] );
+ 		},
+ 		$orderBy
+ 	);
+ 	$this->setContinueEnumParameter( 'continue', implode( '|', $encoded ) );
+ }
+
public function getCacheMode( $params ) {
return 'public';
}
ApiBase::PARAM_MAX => ApiBase::LIMIT_BIG1,
ApiBase::PARAM_MAX2 => ApiBase::LIMIT_BIG2
],
- 'offset' => [
- ApiBase::PARAM_TYPE => 'integer',
+ 'continue' => [
ApiBase::PARAM_HELP_MSG => 'api-help-param-continue',
],
'protocol' => [
$arr = [];
$diffs = array_diff_key( $this->mExternals, $existing );
foreach ( $diffs as $url => $dummy ) {
- foreach ( wfMakeUrlIndexes( $url ) as $index ) {
+ foreach ( LinkFilter::makeIndexes( $url ) as $index ) {
$arr[] = [
'el_from' => $this->mId,
'el_to' => $url,
AddRFCandPMIDInterwiki::class,
PopulatePPSortKey::class,
PopulateIpChanges::class,
+ RefreshExternallinksIndex::class,
];
/**
* @return string
*/
public static function normalizeLinkUrl( $url ) {
- # First, make sure unsafe characters are encoded
+ # Test for RFC 3986 IPv6 syntax
+ $scheme = '[a-z][a-z0-9+.-]*:';
+ $userinfo = '(?:[a-z0-9\-._~!$&\'()*+,;=:]|%[0-9a-f]{2})*';
+ $ipv6Host = '\\[((?:[0-9a-f:]|%3[0-A]|%[46][1-6])+)\\]';
+ if ( preg_match( "<^(?:{$scheme})?//(?:{$userinfo}@)?{$ipv6Host}(?:[:/?#].*|)$>i", $url, $m ) &&
+ IP::isValid( rawurldecode( $m[1] ) )
+ ) {
+ $isIPv6 = rawurldecode( $m[1] );
+ } else {
+ $isIPv6 = false;
+ }
+
+ # Make sure unsafe characters are encoded
$url = preg_replace_callback( '/[\x00-\x20"<>\[\\\\\]^`{|}\x7F-\xFF]/',
function ( $m ) {
return rawurlencode( $m[0] );
$ret = self::normalizeUrlComponent(
substr( $url, 0, $end ), '"#%<>[\]^`{|}/?' ) . $ret;
+ # Fix IPv6 syntax
+ if ( $isIPv6 !== false ) {
+ $ipv6Host = "%5B({$isIPv6})%5D";
+ $ret = preg_replace(
+ "<^((?:{$scheme})?//(?:{$userinfo}@)?){$ipv6Host}(?=[:/?#]|$)>i",
+ "$1[$2]",
+ $ret
+ );
+ }
+
return $ret;
}
}
}
- $target2 = $target;
+ $target2 = Parser::normalizeLinkUrl( $target );
// Get protocol, default is http://
$protocol = 'http://';
$bits = wfParseUrl( $target );
if ( $target != '' ) {
$this->setParams( [
- 'query' => Parser::normalizeLinkUrl( $target2 ),
+ 'query' => $target2,
'namespace' => $namespace,
'protocol' => $protocol ] );
parent::execute( $par );
return false;
}
- /**
- * Return an appropriately formatted LIKE query and the clause
- *
- * @param string $query Search pattern to search for
- * @param string $prot Protocol, e.g. 'http://'
- *
- * @return array
- */
- static function mungeQuery( $query, $prot ) {
- $field = 'el_index';
- $dbr = wfGetDB( DB_REPLICA );
-
- if ( $query === '*' && $prot !== '' ) {
- // Allow queries like 'ftp://*' to find all ftp links
- $rv = [ $prot, $dbr->anyString() ];
- } else {
- $rv = LinkFilter::makeLikeArray( $query, $prot );
- }
-
- if ( $rv === false ) {
- // LinkFilter doesn't handle wildcard in IP, so we'll have to munge here.
- $pattern = '/^(:?[0-9]{1,3}\.)+\*\s*$|^(:?[0-9]{1,3}\.){3}[0-9]{1,3}:[0-9]*\*\s*$/';
- if ( preg_match( $pattern, $query ) ) {
- $rv = [ $prot . rtrim( $query, " \t*" ), $dbr->anyString() ];
- $field = 'el_to';
- }
- }
-
- return [ $rv, $field ];
- }
-
function linkParameters() {
$params = [];
$params['target'] = $this->mProt . $this->mQuery;
public function getQueryInfo() {
$dbr = wfGetDB( DB_REPLICA );
- // strip everything past first wildcard, so that
- // index-based-only lookup would be done
- list( $this->mungedQuery, $clause ) = self::mungeQuery( $this->mQuery, $this->mProt );
+
+ if ( $this->mQuery === '*' && $this->mProt !== '' ) {
+ $this->mungedQuery = [
+ 'el_index_60' . $dbr->buildLike( $this->mProt, $dbr->anyString() ),
+ ];
+ } else {
+ $this->mungedQuery = LinkFilter::getQueryConditions( $this->mQuery, [
+ 'protocol' => $this->mProt,
+ 'oneWildcard' => true,
+ 'db' => $dbr
+ ] );
+ }
if ( $this->mungedQuery === false ) {
// Invalid query; return no results
return [ 'tables' => 'page', 'fields' => 'page_id', 'conds' => '0=1' ];
}
- $stripped = LinkFilter::keepOneWildcard( $this->mungedQuery );
- $like = $dbr->buildLike( $stripped );
+ $orderBy = [];
+ if ( !isset( $this->mungedQuery['el_index_60'] ) ) {
+ $orderBy[] = 'el_index_60';
+ }
+ $orderBy[] = 'el_id';
+
$retval = [
'tables' => [ 'page', 'externallinks' ],
'fields' => [
'value' => 'el_index',
'url' => 'el_to'
],
- 'conds' => [
- 'page_id = el_from',
- "$clause $like"
- ],
- 'options' => [ 'USE INDEX' => $clause ]
+ 'conds' => array_merge(
+ [
+ 'page_id = el_from',
+ ],
+ $this->mungedQuery
+ ),
+ 'options' => [ 'ORDER BY' => $orderBy ]
];
if ( $this->mNs !== null && !$this->getConfig()->get( 'MiserMode' ) ) {
/**
* Override to squash the ORDER BY.
- * We do a truncated index search, so the optimizer won't trust
- * it as good enough for optimizing sort. The implicit ordering
- * from the scan will usually do well enough for our needs.
+ * Not much point in descending order here.
* @return array
*/
function getOrderFields() {
$spec = $this->getArg();
- $likes = [];
+ $protConds = [];
foreach ( [ 'http://', 'https://' ] as $prot ) {
- $like = LinkFilter::makeLikeArray( $spec, $prot );
- if ( !$like ) {
+ $conds = LinkFilter::getQueryConditions( $spec, [ 'protocol' => $prot ] );
+ if ( !$conds ) {
$this->fatalError( "Not a valid hostname specification: $spec" );
}
- $likes[$prot] = $like;
+ $protConds[$prot] = $conds;
}
if ( $this->hasOption( 'all' ) ) {
/** @var $dbr Database */
$dbr = $this->getDB( DB_REPLICA, [], $wikiID );
- foreach ( $likes as $like ) {
+ foreach ( $protConds as $conds ) {
$count = $dbr->selectField(
'externallinks',
'COUNT(*)',
- [ 'el_index' . $dbr->buildLike( $like ) ],
+ $conds,
__METHOD__
);
if ( $count ) {
$count = 0;
/** @var $dbr Database */
$dbr = $this->getDB( DB_REPLICA );
- foreach ( $likes as $prot => $like ) {
+ foreach ( $protConds as $prot => $conds ) {
$res = $dbr->select(
'externallinks',
[ 'DISTINCT el_from' ],
- [ 'el_index' . $dbr->buildLike( $like ) ],
+ $conds,
__METHOD__
);
$count = $dbr->numRows( $res );
public function execute() {
global $wgServer;
+
+ // Extract the host and scheme from $wgServer
+ $bits = wfParseUrl( $wgServer );
+ if ( !$bits ) {
+ // Report the problem and abort the script with a failure status
+ $this->fatalError( 'Could not parse $wgServer' );
+ }
+
$this->output( "Deleting self externals from $wgServer\n" );
$db = $this->getDB( DB_MASTER );
- while ( 1 ) {
- $this->commitTransaction( $db, __METHOD__ );
- $q = $db->limitResult( "DELETE /* deleteSelfExternals */ FROM externallinks WHERE el_to"
- . $db->buildLike( $wgServer . '/', $db->anyString() ), $this->getBatchSize() );
- $this->output( "Deleting a batch\n" );
- $db->query( $q );
- if ( !$db->affectedRows() ) {
- return;
+
+ // If it's protocol-relative, we need to do both http and https.
+ // Otherwise, just do the specified scheme.
+ $host = $bits['host'];
+ if ( isset( $bits['port'] ) ) {
+ $host .= ':' . $bits['port'];
+ }
+ if ( $bits['scheme'] != '' ) {
+ $conds = [ LinkFilter::getQueryConditions( $host, [ 'protocol' => $bits['scheme'] . '://' ] ) ];
+ } else {
+ $conds = [
+ LinkFilter::getQueryConditions( $host, [ 'protocol' => 'http://' ] ),
+ LinkFilter::getQueryConditions( $host, [ 'protocol' => 'https://' ] ),
+ ];
+ }
+
+ foreach ( $conds as $cond ) {
+ // getQueryConditions() returns false for an entry it cannot handle; skip those
+ if ( !$cond ) {
+ continue;
}
+ $cond = $db->makeList( $cond, LIST_AND );
+ // Delete in batches until a pass removes no rows
+ do {
+ $this->commitTransaction( $db, __METHOD__ );
+ $q = $db->limitResult( "DELETE /* deleteSelfExternals */ FROM externallinks WHERE $cond",
+ $this->getBatchSize() );
+ $this->output( "Deleting a batch\n" );
+ $db->query( $q );
+ } while ( $db->affectedRows() );
}
}
}
-- which allows for fast searching for all pages under example.com with the
-- clause:
-- WHERE el_index LIKE 'http://com.example.%'
+ --
+ -- Note if you enable or disable PHP's intl extension, you'll need to run
+ -- maintenance/refreshExternallinksIndex.php to refresh this field.
el_index nvarchar(450) NOT NULL,
-- This is el_index truncated to 60 bytes to allow for sortable queries that
--- /dev/null
+<?php
+/**
+ * Refresh the externallinks table el_index and el_index_60 from el_to
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @file
+ * @ingroup Maintenance
+ */
+
+require_once __DIR__ . '/Maintenance.php';
+
+/**
+ * Maintenance script that refreshes the externallinks table el_index and
+ * el_index_60 from el_to
+ *
+ * @ingroup Maintenance
+ * @since 1.33
+ */
+class RefreshExternallinksIndex extends LoggedUpdateMaintenance {
+	public function __construct() {
+		parent::__construct();
+		$this->addDescription(
+			'Refresh the externallinks table el_index and el_index_60 from el_to' );
+		$this->setBatchSize( 10000 );
+	}
+
+	/**
+	 * Build the update key from the class name, LinkFilter::VERSION and
+	 * whether IDN support is currently available, so that a change in either
+	 * of the latter two yields a different key.
+	 *
+	 * @return string
+	 */
+	protected function getUpdateKey() {
+		return static::class
+			. ' v' . LinkFilter::VERSION
+			. ( LinkFilter::supportsIDN() ? '+' : '-' ) . 'IDN';
+	}
+
+	/**
+	 * @return string
+	 */
+	protected function updateSkippedMessage() {
+		return 'externallinks table indexes up to date';
+	}
+
+	/**
+	 * Walk the externallinks table in el_id ranges and recompute el_index and
+	 * el_index_60 from el_to. Rows whose el_to yields no index are deleted.
+	 *
+	 * @return bool False if the externallinks table does not exist, true otherwise
+	 */
+	protected function doDBUpdates() {
+		$dbw = $this->getDB( DB_MASTER );
+		if ( !$dbw->tableExists( 'externallinks' ) ) {
+			$this->error( "externallinks table does not exist" );
+			return false;
+		}
+		$this->output( "Updating externallinks table index fields\n" );
+
+		$minmax = $dbw->selectRow(
+			'externallinks',
+			[ 'min' => 'MIN(el_id)', 'max' => 'MAX(el_id)' ],
+			'',
+			__METHOD__
+		);
+
+		$updated = 0;
+		$deleted = 0;
+		// MIN()/MAX() are NULL when the table has no rows. Use an explicit empty
+		// range in that case instead of relying on PHP's int-vs-null comparison.
+		$last = ( $minmax && $minmax->max !== null ) ? (int)$minmax->max : 0;
+		$start = ( $minmax && $minmax->min !== null ) ? (int)$minmax->min - 1 : $last;
+		$batchSize = $this->getBatchSize();
+		while ( $start < $last ) {
+			$end = min( $start + $batchSize, $last );
+			$this->output( "el_id $start - $end of $last\n" );
+			$res = $dbw->select( 'externallinks', [ 'el_id', 'el_to', 'el_index' ],
+				[
+					"el_id > $start",
+					"el_id <= $end",
+				],
+				__METHOD__,
+				[ 'ORDER BY' => 'el_id' ]
+			);
+			foreach ( $res as $row ) {
+				$newIndexes = LinkFilter::makeIndexes( $row->el_to );
+				if ( !$newIndexes ) {
+					// No index can be built for this URL: remove the row
+					$dbw->delete( 'externallinks', [ 'el_id' => $row->el_id ], __METHOD__ );
+					$deleted++;
+					continue;
+				}
+				if ( in_array( $row->el_index, $newIndexes, true ) ) {
+					// Already up to date
+					continue;
+				}
+
+				if ( count( $newIndexes ) === 1 ) {
+					$newIndex = $newIndexes[0];
+				} else {
+					// Assume the scheme is the only difference between the different $newIndexes.
+					// Keep this row's scheme, assuming there's another row with the other scheme.
+					$newIndex = substr( $row->el_index, 0, strpos( $row->el_index, ':' ) ) .
+						substr( $newIndexes[0], strpos( $newIndexes[0], ':' ) );
+				}
+				$dbw->update( 'externallinks',
+					[
+						'el_index' => $newIndex,
+						'el_index_60' => substr( $newIndex, 0, 60 ),
+					],
+					[ 'el_id' => $row->el_id ],
+					__METHOD__
+				);
+				$updated++;
+			}
+			wfWaitForSlaves();
+			$start = $end;
+		}
+		$this->output( "Done, $updated rows updated, $deleted deleted.\n" );
+
+		return true;
+	}
+}
+
+$maintClass = RefreshExternallinksIndex::class;
+require_once RUN_MAINTENANCE_IF_MAIN;
-- which allows for fast searching for all pages under example.com with the
-- clause:
-- WHERE el_index LIKE 'http://com.example.%'
+ --
+ -- Note: if you enable or disable PHP's intl extension, you'll need to run
+ -- maintenance/refreshExternallinksIndex.php to refresh this field.
el_index blob NOT NULL,
-- This is el_index truncated to 60 bytes to allow for sortable queries that
];
}
- /**
- * @dataProvider provideMakeUrlIndexes()
- * @covers ::wfMakeUrlIndexes
- */
- public function testMakeUrlIndexes( $url, $expected ) {
- $index = wfMakeUrlIndexes( $url );
- $this->assertEquals( $expected, $index, "wfMakeUrlIndexes(\"$url\")" );
- }
-
- public static function provideMakeUrlIndexes() {
- return [
- // Testcase for T30627
- [
- 'https://example.org/test.cgi?id=12345',
- [ 'https://org.example./test.cgi?id=12345' ]
- ],
- [
- // mailtos are handled special
- // is this really right though? that final . probably belongs earlier?
- 'mailto:wiki@wikimedia.org',
- [ 'mailto:org.wikimedia@wiki.' ]
- ],
-
- // file URL cases per T30627...
- [
- // three slashes: local filesystem path Unix-style
- 'file:///whatever/you/like.txt',
- [ 'file://./whatever/you/like.txt' ]
- ],
- [
- // three slashes: local filesystem path Windows-style
- 'file:///c:/whatever/you/like.txt',
- [ 'file://./c:/whatever/you/like.txt' ]
- ],
- [
- // two slashes: UNC filesystem path Windows-style
- 'file://intranet/whatever/you/like.txt',
- [ 'file://intranet./whatever/you/like.txt' ]
- ],
- // Multiple-slash cases that can sorta work on Mozilla
- // if you hack it just right are kinda pathological,
- // and unreliable cross-platform or on IE which means they're
- // unlikely to appear on intranets.
- // Those will survive the algorithm but with results that
- // are less consistent.
-
- // protocol-relative URL cases per T31854...
- [
- '//example.org/test.cgi?id=12345',
- [
- 'http://org.example./test.cgi?id=12345',
- 'https://org.example./test.cgi?id=12345'
- ]
- ],
- ];
- }
-
/**
* @dataProvider provideWfMatchesDomainList
* @covers ::wfMatchesDomainList
[ 'http://', 'test.com', 'http://name:pass@test.com' ],
[ 'http://', '*.test.com', 'http://a.b.c.test.com/dir/dir/file?a=6' ],
[ null, 'http://*.test.com', 'http://www.test.com' ],
+ [ 'http://', '.test.com', 'http://.test.com' ],
+ [ 'http://', '*..test.com', 'http://foo..test.com' ],
[ 'mailto:', 'name@mail.test123.com', 'mailto:name@mail.test123.com' ],
+ [ 'mailto:', '*@mail.test123.com', 'mailto:name@mail.test123.com' ],
[ '',
'http://name:pass@www.test.com:12345/dir/dir/file.xyz.php#__se__?arg1=_&arg2[]=4rtg',
'http://name:pass@www.test.com:12345/dir/dir/file.xyz.php#__se__?arg1=_&arg2[]=4rtg'
'http://xx23124:__ffdfdef__@www.test.com:12345/dir' ,
'http://name:pass@www.test.com:12345/dir/dir/file.xyz.php#__se__?arg1=_&arg2[]=4rtg'
],
+ [ 'http://', '127.0.0.1', 'http://127.000.000.001' ],
+ [ 'http://', '127.0.0.*', 'http://127.000.000.010' ],
+ [ 'http://', '127.0.*', 'http://127.000.123.010' ],
+ [ 'http://', '127.*', 'http://127.127.127.127' ],
+ [ 'http://', '[0:0:0:0:0:0:0:0001]', 'http://[::1]' ],
+ [ 'http://', '[2001:db8:0:0:*]', 'http://[2001:0DB8::]' ],
+ [ 'http://', '[2001:db8:0:0:*]', 'http://[2001:0DB8::123]' ],
+ [ 'http://', '[2001:db8:0:0:*]', 'http://[2001:0DB8::123:456]' ],
+ [ 'http://', 'xn--f-vgaa.example.com', 'http://fóó.example.com', [ 'idn' => true ] ],
+ [ 'http://', 'xn--f-vgaa.example.com', 'http://f%c3%b3%C3%B3.example.com', [ 'idn' => true ] ],
+ [ 'http://', 'fóó.example.com', 'http://xn--f-vgaa.example.com', [ 'idn' => true ] ],
+ [ 'http://', 'f%c3%b3%C3%B3.example.com', 'http://xn--f-vgaa.example.com', [ 'idn' => true ] ],
+ [ 'http://', 'f%c3%b3%C3%B3.example.com', 'http://fóó.example.com' ],
+ [ 'http://', 'fóó.example.com', 'http://f%c3%b3%C3%B3.example.com' ],
+
+ [ 'http://', 'example.com./foo', 'http://example.com/foo' ],
+ [ 'http://', 'example.com/foo', 'http://example.com./foo' ],
+ [ 'http://', '127.0.0.1./foo', 'http://127.0.0.1/foo' ],
+ [ 'http://', '127.0.0.1/foo', 'http://127.0.0.1./foo' ],
// Tests for false positives
- [ 'http://', 'test.com', 'http://www.test.com', false ],
- [ 'http://', 'www1.test.com', 'http://www.test.com', false ],
- [ 'http://', '*.test.com', 'http://www.test.t.com', false ],
- [ '', 'http://test.com:8080', 'http://www.test.com:8080', false ],
- [ '', 'https://test.com', 'http://test.com', false ],
- [ '', 'http://test.com', 'https://test.com', false ],
- [ 'http://', 'http://test.com', 'http://test.com', false ],
- [ null, 'http://www.test.com', 'http://www.test.com:80', false ],
- [ null, 'http://www.test.com:80', 'http://www.test.com', false ],
- [ null, 'http://*.test.com:80', 'http://www.test.com', false ],
+ [ 'http://', 'test.com', 'http://www.test.com', [ 'found' => false ] ],
+ [ 'http://', 'www1.test.com', 'http://www.test.com', [ 'found' => false ] ],
+ [ 'http://', '*.test.com', 'http://www.test.t.com', [ 'found' => false ] ],
+ [ 'http://', 'test.com', 'http://xtest.com', [ 'found' => false ] ],
+ [ 'http://', '*.test.com', 'http://xtest.com', [ 'found' => false ] ],
+ [ 'http://', '.test.com', 'http://test.com', [ 'found' => false ] ],
+ [ 'http://', '.test.com', 'http://www.test.com', [ 'found' => false ] ],
+ [ 'http://', '*..test.com', 'http://test.com', [ 'found' => false ] ],
+ [ 'http://', '*..test.com', 'http://www.test.com', [ 'found' => false ] ],
+ [ '', 'http://test.com:8080', 'http://www.test.com:8080', [ 'found' => false ] ],
+ [ '', 'https://test.com', 'http://test.com', [ 'found' => false ] ],
+ [ '', 'http://test.com', 'https://test.com', [ 'found' => false ] ],
+ [ 'http://', 'http://test.com', 'http://test.com', [ 'found' => false ] ],
+ [ null, 'http://www.test.com', 'http://www.test.com:80', [ 'found' => false ] ],
+ [ null, 'http://www.test.com:80', 'http://www.test.com', [ 'found' => false ] ],
+ [ null, 'http://*.test.com:80', 'http://www.test.com', [ 'found' => false ] ],
[ '', 'https://gerrit.wikimedia.org/r/#/XXX/status:open,n,z',
- 'https://gerrit.wikimedia.org/r/#/q/status:open,n,z', false ],
+ 'https://gerrit.wikimedia.org/r/#/q/status:open,n,z', [ 'found' => false ] ],
[ '', 'https://*.wikimedia.org/r/#/q/status:open,n,z',
- 'https://gerrit.wikimedia.org/r/#/XXX/status:open,n,z', false ],
- [ 'mailto:', '@test.com', '@abc.test.com', false ],
- [ 'mailto:', 'mail@test.com', 'mail2@test.com', false ],
- [ '', 'mailto:mail@test.com', 'mail2@test.com', false ],
- [ '', 'mailto:@test.com', '@abc.test.com', false ],
- [ 'ftp://', '*.co', 'ftp://www.co.uk', false ],
- [ 'ftp://', '*.co', 'ftp://www.co.m', false ],
- [ 'ftp://', '*.co/dir/', 'ftp://www.co/dir2/', false ],
- [ 'ftp://', 'www.co/dir/', 'ftp://www.co/dir2/', false ],
- [ 'ftp://', 'test.com/dir/', 'ftp://test.com/', false ],
- [ '', 'http://test.com:8080/dir/', 'http://test.com:808/dir/', false ],
- [ '', 'http://test.com/dir/index.html', 'http://test.com/dir/index.php', false ],
+ 'https://gerrit.wikimedia.org/r/#/XXX/status:open,n,z', [ 'found' => false ] ],
+ [ 'mailto:', '@test.com', '@abc.test.com', [ 'found' => false ] ],
+ [ 'mailto:', 'mail@test.com', 'mail2@test.com', [ 'found' => false ] ],
+ [ '', 'mailto:mail@test.com', 'mail2@test.com', [ 'found' => false ] ],
+ [ '', 'mailto:@test.com', '@abc.test.com', [ 'found' => false ] ],
+ [ 'ftp://', '*.co', 'ftp://www.co.uk', [ 'found' => false ] ],
+ [ 'ftp://', '*.co', 'ftp://www.co.m', [ 'found' => false ] ],
+ [ 'ftp://', '*.co/dir/', 'ftp://www.co/dir2/', [ 'found' => false ] ],
+ [ 'ftp://', 'www.co/dir/', 'ftp://www.co/dir2/', [ 'found' => false ] ],
+ [ 'ftp://', 'test.com/dir/', 'ftp://test.com/', [ 'found' => false ] ],
+ [ '', 'http://test.com:8080/dir/', 'http://test.com:808/dir/', [ 'found' => false ] ],
+ [ '', 'http://test.com/dir/index.html', 'http://test.com/dir/index.php', [ 'found' => false ] ],
+ [ 'http://', '127.0.0.*', 'http://127.0.1.0', [ 'found' => false ] ],
+ [ 'http://', '[2001:db8::*]', 'http://[2001:0DB8::123:456]', [ 'found' => false ] ],
// These are false positives too and ideally shouldn't match, but that
// would require using regexes and RLIKE instead of LIKE
- // [ null, 'http://*.test.com', 'http://www.test.com:80', false ],
+ // [ null, 'http://*.test.com', 'http://www.test.com:80', [ 'found' => false ] ],
// [ '', 'https://*.wikimedia.org/r/#/q/status:open,n,z',
- // 'https://gerrit.wikimedia.org/XXX/r/#/q/status:open,n,z', false ],
+ // 'https://gerrit.wikimedia.org/XXX/r/#/q/status:open,n,z', [ 'found' => false ] ],
];
}
* testMakeLikeArrayWithValidPatterns()
*
* Tests whether the LIKE clause produced by LinkFilter::makeLikeArray($pattern, $protocol)
- * will find one of the URL indexes produced by wfMakeUrlIndexes($url)
+ * will find one of the URL indexes produced by LinkFilter::makeIndexes($url)
*
* @dataProvider provideValidPatterns
*
* @param string $protocol Protocol, e.g. 'http://' or 'mailto:'
* @param string $pattern Search pattern to feed to LinkFilter::makeLikeArray
- * @param string $url URL to feed to wfMakeUrlIndexes
- * @param bool $shouldBeFound Should the URL be found? (defaults true)
+ * @param string $url URL to feed to LinkFilter::makeIndexes
+ * @param array $options
+ * - found: (bool) Should the URL be found? (defaults true)
+ * - idn: (bool) Does this test require the idn conversion (default false)
*/
- function testMakeLikeArrayWithValidPatterns( $protocol, $pattern, $url, $shouldBeFound = true ) {
- $indexes = wfMakeUrlIndexes( $url );
+ function testMakeLikeArrayWithValidPatterns( $protocol, $pattern, $url, $options = [] ) {
+ $options += [ 'found' => true, 'idn' => false ];
+ if ( !empty( $options['idn'] ) && !LinkFilter::supportsIDN() ) {
+ $this->markTestSkipped( 'LinkFilter IDN support is not available' );
+ }
+
+ $indexes = LinkFilter::makeIndexes( $url );
$likeArray = LinkFilter::makeLikeArray( $pattern, $protocol );
$this->assertTrue( $likeArray !== false,
$regex = $this->createRegexFromLIKE( $likeArray );
$debugmsg = "Regex: '" . $regex . "'\n";
- $debugmsg .= count( $indexes ) . " index(es) created by wfMakeUrlIndexes():\n";
+ $debugmsg .= count( $indexes ) . " index(es) created by LinkFilter::makeIndexes():\n";
$matches = 0;
$debugmsg .= "\t'$index'\n";
}
- if ( $shouldBeFound ) {
+ if ( !empty( $options['found'] ) ) {
$this->assertTrue(
$matches > 0,
"Search pattern '$protocol$pattern' does not find url '$url' \n$debugmsg"
);
}
+ /**
+  * Each URL from the provider must yield exactly the expected index list.
+  *
+  * @dataProvider provideMakeIndexes()
+  * @covers LinkFilter::makeIndexes
+  */
+ public function testMakeIndexes( $url, $expected ) {
+ 	// file:// is not a default protocol; register it so the file URL cases can work
+ 	$protocols = [ 'http://', 'https://', 'mailto:', '//', 'file://' ];
+ 	$this->setMwGlobals( [ 'wgUrlProtocols' => $protocols ] );
+
+ 	$this->assertEquals(
+ 		$expected,
+ 		LinkFilter::makeIndexes( $url ),
+ 		"LinkFilter::makeIndexes(\"$url\")"
+ 	);
+ }
+
+ /**
+  * Datasets for testMakeIndexes(): [ URL, expected list of indexes ].
+  *
+  * @return array[]
+  */
+ public static function provideMakeIndexes() {
+ 	$cases = [
+ 		// Testcase for T30627
+ 		[
+ 			'https://example.org/test.cgi?id=12345',
+ 			[ 'https://org.example./test.cgi?id=12345' ],
+ 		],
+
+ 		// mailto is handled specially: the reversed domain comes before the '@'
+ 		[ 'mailto:wiki@wikimedia.org', [ 'mailto:org.wikimedia.@wiki' ] ],
+ 		// mailto without a domain part
+ 		[ 'mailto:wiki', [ 'mailto:@wiki' ] ],
+
+ 		// file URL cases per T30627...
+ 		// three slashes: local filesystem path Unix-style
+ 		[ 'file:///whatever/you/like.txt', [ 'file://./whatever/you/like.txt' ] ],
+ 		// three slashes: local filesystem path Windows-style
+ 		[ 'file:///c:/whatever/you/like.txt', [ 'file://./c:/whatever/you/like.txt' ] ],
+ 		// two slashes: UNC filesystem path Windows-style
+ 		[ 'file://intranet/whatever/you/like.txt', [ 'file://intranet./whatever/you/like.txt' ] ],
+ 		// Multiple-slash cases, which can sort of work on Mozilla if hacked
+ 		// just right, are pathological and unreliable cross-platform or on IE,
+ 		// so they are unlikely to appear on intranets. They survive the
+ 		// algorithm, but with less consistent results.
+
+ 		// protocol-relative URL cases per T31854...
+ 		[
+ 			'//example.org/test.cgi?id=12345',
+ 			[
+ 				'http://org.example./test.cgi?id=12345',
+ 				'https://org.example./test.cgi?id=12345',
+ 			],
+ 		],
+
+ 		// IP addresses
+ 		[ 'http://192.0.2.0/foo', [ 'http://V4.192.0.2.0./foo' ] ],
+ 		[ 'http://192.0.0002.0/foo', [ 'http://V4.192.0.2.0./foo' ] ],
+ 		[ 'http://[2001:db8::1]/foo', [ 'http://V6.2001.DB8.0.0.0.0.0.1./foo' ] ],
+
+ 		// Explicit specification of the DNS root
+ 		[ 'http://example.com./foo', [ 'http://com.example./foo' ] ],
+ 		[ 'http://192.0.2.0./foo', [ 'http://V4.192.0.2.0./foo' ] ],
+
+ 		// Weird edge case: leading dot in the host
+ 		[ 'http://.example.com/foo', [ 'http://com.example../foo' ] ],
+ 	];
+
+ 	return $cases;
+ }
+
+ /**
+  * The conditions built for a query and option set must equal the expected value.
+  *
+  * @dataProvider provideGetQueryConditions
+  * @covers LinkFilter::getQueryConditions
+  */
+ public function testGetQueryConditions( $query, $options, $expected ) {
+ 	$this->assertEquals(
+ 		$expected,
+ 		LinkFilter::getQueryConditions( $query, $options )
+ 	);
+ }
+
+ /**
+  * Datasets for testGetQueryConditions(): [ query, options, expected result ].
+  *
+  * @return array[]
+  */
+ public static function provideGetQueryConditions() {
+ 	$cases = [
+ 		'Basic example' => [
+ 			'example.com',
+ 			[],
+ 			[
+ 				"el_index_60 LIKE 'http://com.example./%' ESCAPE '`' ",
+ 				"el_index LIKE 'http://com.example./%' ESCAPE '`' ",
+ 			],
+ 		],
+ 		'Basic example with path' => [
+ 			'example.com/foobar',
+ 			[],
+ 			[
+ 				"el_index_60 LIKE 'http://com.example./foobar%' ESCAPE '`' ",
+ 				"el_index LIKE 'http://com.example./foobar%' ESCAPE '`' ",
+ 			],
+ 		],
+ 		'Wildcard domain' => [
+ 			'*.example.com',
+ 			[],
+ 			[
+ 				"el_index_60 LIKE 'http://com.example.%' ESCAPE '`' ",
+ 				"el_index LIKE 'http://com.example.%' ESCAPE '`' ",
+ 			],
+ 		],
+ 		'Wildcard domain with path' => [
+ 			'*.example.com/foobar',
+ 			[],
+ 			[
+ 				"el_index_60 LIKE 'http://com.example.%' ESCAPE '`' ",
+ 				"el_index LIKE 'http://com.example.%/foobar%' ESCAPE '`' ",
+ 			],
+ 		],
+ 		'Wildcard domain with path, oneWildcard=true' => [
+ 			'*.example.com/foobar',
+ 			[ 'oneWildcard' => true ],
+ 			[
+ 				"el_index_60 LIKE 'http://com.example.%' ESCAPE '`' ",
+ 				"el_index LIKE 'http://com.example.%' ESCAPE '`' ",
+ 			],
+ 		],
+ 		// The 60-byte column is matched by equality here rather than LIKE
+ 		'Constant prefix' => [
+ 			'example.com/blah/blah/blah/blah/blah/blah/blah/blah/blah/blah?foo=',
+ 			[],
+ 			[
+ 				'el_index_60' => 'http://com.example./blah/blah/blah/blah/blah/blah/blah/blah/',
+ 				"el_index LIKE " .
+ 					"'http://com.example./blah/blah/blah/blah/blah/blah/blah/blah/blah/blah?foo=%' " .
+ 					"ESCAPE '`' ",
+ 			],
+ 		],
+ 		'Bad protocol' => [
+ 			'test/',
+ 			[ 'protocol' => 'invalid://' ],
+ 			false,
+ 		],
+ 		'Various options' => [
+ 			'example.com',
+ 			[ 'protocol' => 'https://', 'prefix' => 'xx' ],
+ 			[
+ 				"xx_index_60 LIKE 'https://com.example./%' ESCAPE '`' ",
+ 				"xx_index LIKE 'https://com.example./%' ESCAPE '`' ",
+ 			],
+ 		],
+ 	];
+
+ 	return $cases;
+ }
+
+
}
'http://example.org/%23%2F%3F%26%3D%2B%3B?%23%2F%3F%26%3D%2B%3B#%23%2F%3F%26%3D%2B%3B',
'http://example.org/%23%2F%3F&=+;?%23/?%26%3D%2B%3B#%23/?&=+;',
],
+ [
+ 'IPv6 links aren\'t escaped',
+ 'http://[::1]/foobar',
+ 'http://[::1]/foobar',
+ ],
+ [
+ 'non-IPv6 links aren\'t unescaped',
+ 'http://%5B::1%5D/foobar',
+ 'http://%5B::1%5D/foobar',
+ ],
];
}