Unicode space separator characters (Zs) now terminate links
author: Antoine Musso <hashar@users.mediawiki.org>
Wed, 27 Jul 2011 18:03:01 +0000 (18:03 +0000)
committer: Antoine Musso <hashar@users.mediawiki.org>
Wed, 27 Jul 2011 18:03:01 +0000 (18:03 +0000)
Fixes bug 19052, which only reported the issue for U+3000 IDEOGRAPHIC SPACE.
Covers both external links and image links. See parser tests for examples.

Unicode 'Zs' includes all characters from the 'separator, space' category.
The characters in this category are:

Char    Name
U+0020  SPACE
U+00A0  NO-BREAK SPACE
U+1680  OGHAM SPACE MARK
U+180E  MONGOLIAN VOWEL SEPARATOR
U+2000  EN QUAD
U+2001  EM QUAD
U+2002  EN SPACE
U+2003  EM SPACE
U+2004  THREE-PER-EM SPACE
U+2005  FOUR-PER-EM SPACE
U+2006  SIX-PER-EM SPACE
U+2007  FIGURE SPACE
U+2008  PUNCTUATION SPACE
U+2009  THIN SPACE
U+200A  HAIR SPACE
U+202F  NARROW NO-BREAK SPACE
U+205F  MEDIUM MATHEMATICAL SPACE
U+3000  IDEOGRAPHIC SPACE

TEST PLAN:

$ php parserTests.php --quiet
This is MediaWiki version 1.19alpha (r93258).

Reading tests from "tests/parser/parserTests.txt"...
Reading tests from "tests/parser/extraParserTests.txt"...
Reading tests from "../mwexts/LabeledSectionTransclusion/lstParserTests.txt"...
Passed 686 of 686 tests (100%)... ALL TESTS PASSED!

Sounds good :-)

RELEASE-NOTES-1.19
includes/parser/Parser.php
tests/parser/parserTests.txt

index 3f4bacf..e8ec2f3 100644 (file)
@@ -20,6 +20,8 @@ production.
   elements to work with DNSBLs that require keys, such as
   Project Honeypot.
 * (bug 30022) Add support for custom loadScript sources to ResourceLoader.
+* (bug 19052) Unicode space separator characters (Zs) now terminate external
+  links and image links.
 
 === Bug fixes in 1.19 ===
 * $wgUploadNavigationUrl should be used for file redlinks if
index f19d96d..f03229a 100644 (file)
@@ -68,9 +68,11 @@ class Parser {
 
        # Constants needed for external link processing
        # Everything except bracket, space, or control characters
-       const EXT_LINK_URL_CLASS = '[^][<>"\\x00-\\x20\\x7F]';
-       const EXT_IMAGE_REGEX = '/^(http:\/\/|https:\/\/)([^][<>"\\x00-\\x20\\x7F]+)
-               \\/([A-Za-z0-9_.,~%\\-+&;#*?!=()@\\x80-\\xFF]+)\\.((?i)gif|png|jpg|jpeg)$/Sx';
+       # \p{Zs} is the Unicode 'separator, space' category. It covers the space
+       # 0x20 as well as U+3000 IDEOGRAPHIC SPACE (see bug 19052)
+       const EXT_LINK_URL_CLASS = '[^][<>"\\x00-\\x20\\x7F\p{Zs}]';
+       const EXT_IMAGE_REGEX = '/^(http:\/\/|https:\/\/)([^][<>"\\x00-\\x20\\x7F\p{Zs}]+)
+               \\/([A-Za-z0-9_.,~%\\-+&;#*?!=()@\\x80-\\xFF]+)\\.((?i)gif|png|jpg|jpeg)$/Sxu';
 
        # State constants for the definition list colon extraction
        const COLON_STATE_TEXT = 0;
@@ -184,7 +186,7 @@ class Parser {
                $this->mConf = $conf;
                $this->mUrlProtocols = wfUrlProtocols();
                $this->mExtLinkBracketedRegex = '/\[((' . wfUrlProtocols() . ')'.
-                       '[^][<>"\\x00-\\x20\\x7F]+) *([^\]\\x00-\\x08\\x0a-\\x1F]*?)\]/S';
+                       self::EXT_LINK_URL_CLASS.'+)\p{Zs}*([^\]\\x00-\\x08\\x0a-\\x1F]*?)\]/Su';
                if ( isset( $conf['preprocessorClass'] ) ) {
                        $this->mPreprocessorClass = $conf['preprocessorClass'];
                } elseif ( defined( 'MW_COMPILED' ) ) {
@@ -1197,7 +1199,7 @@ class Parser {
                                        (?: [0-9]  [\ \-]? ){9} # 9 digits with opt. delimiters
                                        [0-9Xx]                 # check digit
                                        \b)
-                       )!x', array( &$this, 'magicLinkCallback' ), $text );
+                       )!xu', array( &$this, 'magicLinkCallback' ), $text );
                wfProfileOut( __METHOD__ );
                return $text;
        }
@@ -4963,7 +4965,7 @@ class Parser {
                                                                $value = true;
                                                                $validated = true;
                                                        } elseif ( preg_match( "/^$prots/", $value ) ) {
-                                                               if ( preg_match( "/^($prots)$chars+$/", $value, $m ) ) {
+                                                               if ( preg_match( "/^($prots)$chars+$/u", $value, $m ) ) {
                                                                        $paramName = 'link-url';
                                                                        $this->mOutput->addExternalLink( $value );
                                                                        if ( $this->mOptions->getExternalLinkTarget() ) {
index c59e4db..ef5a663 100644 (file)
@@ -9248,6 +9248,32 @@ Text&apos;s been normalized?
 </p>
 !! end
 
+!! test
+Bug 19052 U+3000 IDEOGRAPHIC SPACE should terminate free external links
+!! input
+http://www.example.org/ <-- U+3000 (vim: ^Vu3000)
+!! result
+<p><a rel="nofollow" class="external free" href="http://www.example.org/">http://www.example.org/</a> &lt;-- U+3000 (vim: ^Vu3000)
+</p>
+!! end
+
+!! test
+Bug 19052 U+3000 IDEOGRAPHIC SPACE should terminate bracketed external links
+!! input
+[http://www.example.org/ ideograms]
+!! result
+<p><a rel="nofollow" class="external text" href="http://www.example.org/">ideograms</a>
+</p>
+!! end
+
+!! test
+Bug 19052 U+3000 IDEOGRAPHIC SPACE should terminate external images links
+!! input
+http://www.example.org/pic.png <-- U+3000 (vim: ^Vu3000)
+!! result
+<p><img src="http://www.example.org/pic.png" alt="pic.png" /> &lt;-- U+3000 (vim: ^Vu3000)
+</p>
+!! end
 
 TODO:
 more images