From: Antoine Musso Date: Wed, 27 Jul 2011 18:03:01 +0000 (+0000) Subject: Unicode space separator characters (Zs) now terminates links X-Git-Tag: 1.31.0-rc.0~28590 X-Git-Url: http://git.cyclocoop.org/%22%20.%20%20%20%24self2%20.%20%20%20%22&var_mode_affiche=boucle?a=commitdiff_plain;h=176f91596c80cf5e41484c7377d809d09f0ecbf7;p=lhc%2Fweb%2Fwiklou.git Unicode space separator characters (Zs) now terminates links Fixes bug 19052, which only reported the issue for U+3000 IDEOGRAPHIC SPACE. Covers both external links and image links. See parser tests for examples. Unicode 'Zs' includes all characters from the 'separator, space' category. The characters in this category are: Char Name U+0020 SPACE U+00A0 NO-BREAK SPACE U+1680 OGHAM SPACE MARK U+180E MONGOLIAN VOWEL SEPARATOR U+2000 EN QUAD U+2001 EM QUAD U+2002 EN SPACE U+2003 EM SPACE U+2004 THREE-PER-EM SPACE U+2005 FOUR-PER-EM SPACE U+2006 SIX-PER-EM SPACE U+2007 FIGURE SPACE U+2008 PUNCTUATION SPACE U+2009 THIN SPACE U+200A HAIR SPACE U+202F NARROW NO-BREAK SPACE U+205F MEDIUM MATHEMATICAL SPACE U+3000 IDEOGRAPHIC SPACE TEST PLAN: $ php parserTests.php --quiet This is MediaWiki version 1.19alpha (r93258). Reading tests from "tests/parser/parserTests.txt"... Reading tests from "tests/parser/extraParserTests.txt"... Reading tests from "../mwexts/LabeledSectionTransclusion/lstParserTests.txt"... Passed 686 of 686 tests (100%)... ALL TESTS PASSED! Sounds good :-) --- diff --git a/RELEASE-NOTES-1.19 b/RELEASE-NOTES-1.19 index 3f4bacfee6..e8ec2f37ff 100644 --- a/RELEASE-NOTES-1.19 +++ b/RELEASE-NOTES-1.19 @@ -20,6 +20,8 @@ production. elements to work with DNSBLs that require keys, such as Project Honeypot. * (bug 30022) Add support for custom loadScript sources to ResourceLoader. +* (bug 19052) Unicode space separator characters (Zs) now terminates external + links and images links. 
=== Bug fixes in 1.19 === * $wgUploadNavigationUrl should be used for file redlinks if diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php index f19d96d443..f03229aff7 100644 --- a/includes/parser/Parser.php +++ b/includes/parser/Parser.php @@ -68,9 +68,11 @@ class Parser { # Constants needed for external link processing # Everything except bracket, space, or control characters - const EXT_LINK_URL_CLASS = '[^][<>"\\x00-\\x20\\x7F]'; - const EXT_IMAGE_REGEX = '/^(http:\/\/|https:\/\/)([^][<>"\\x00-\\x20\\x7F]+) - \\/([A-Za-z0-9_.,~%\\-+&;#*?!=()@\\x80-\\xFF]+)\\.((?i)gif|png|jpg|jpeg)$/Sx'; + # \p{Zs} is unicode 'separator, space' category. It covers the space 0x20 + # as well as U+3000 is IDEOGRAPHIC SPACE for bug 19052 + const EXT_LINK_URL_CLASS = '[^][<>"\\x00-\\x20\\x7F\p{Zs}]'; + const EXT_IMAGE_REGEX = '/^(http:\/\/|https:\/\/)([^][<>"\\x00-\\x20\\x7F\p{Zs}]+) + \\/([A-Za-z0-9_.,~%\\-+&;#*?!=()@\\x80-\\xFF]+)\\.((?i)gif|png|jpg|jpeg)$/Sxu'; # State constants for the definition list colon extraction const COLON_STATE_TEXT = 0; @@ -184,7 +186,7 @@ class Parser { $this->mConf = $conf; $this->mUrlProtocols = wfUrlProtocols(); $this->mExtLinkBracketedRegex = '/\[((' . wfUrlProtocols() . ')'. - '[^][<>"\\x00-\\x20\\x7F]+) *([^\]\\x00-\\x08\\x0a-\\x1F]*?)\]/S'; + self::EXT_LINK_URL_CLASS.'+)\p{Zs}*([^\]\\x00-\\x08\\x0a-\\x1F]*?)\]/Su'; if ( isset( $conf['preprocessorClass'] ) ) { $this->mPreprocessorClass = $conf['preprocessorClass']; } elseif ( defined( 'MW_COMPILED' ) ) { @@ -1197,7 +1199,7 @@ class Parser { (?: [0-9] [\ \-]? ){9} # 9 digits with opt. 
delimiters [0-9Xx] # check digit \b) - )!x', array( &$this, 'magicLinkCallback' ), $text ); + )!xu', array( &$this, 'magicLinkCallback' ), $text ); wfProfileOut( __METHOD__ ); return $text; } @@ -4963,7 +4965,7 @@ class Parser { $value = true; $validated = true; } elseif ( preg_match( "/^$prots/", $value ) ) { - if ( preg_match( "/^($prots)$chars+$/", $value, $m ) ) { + if ( preg_match( "/^($prots)$chars+$/u", $value, $m ) ) { $paramName = 'link-url'; $this->mOutput->addExternalLink( $value ); if ( $this->mOptions->getExternalLinkTarget() ) { diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt index c59e4db102..ef5a663b7d 100644 --- a/tests/parser/parserTests.txt +++ b/tests/parser/parserTests.txt @@ -9248,6 +9248,32 @@ Text's been normalized?

!! end +!! test +Bug 19052 U+3000 IDEOGRAPHIC SPACE should terminate free external links +!! input +http://www.example.org/ <-- U+3000 (vim: ^Vu3000) +!! result +

http://www.example.org/ <-- U+3000 (vim: ^Vu3000) +

+!! end + +!! test +Bug 19052 U+3000 IDEOGRAPHIC SPACE should terminate bracketed external links +!! input +[http://www.example.org/ ideograms] +!! result +

ideograms +

+!! end + +!! test +Bug 19052 U+3000 IDEOGRAPHIC SPACE should terminate external images links +!! input +http://www.example.org/pic.png <-- U+3000 (vim: ^Vu3000) +!! result +

pic.png <-- U+3000 (vim: ^Vu3000) +

+!! end TODO: more images