From: Brion Vibber
Date: Fri, 2 Jun 2006 20:54:34 +0000 (+0000)
Subject: * Improve handling of ;: definition list construct with overlapping or nested HTML...
X-Git-Tag: 1.31.0-rc.0~56924
X-Git-Url: http://git.cyclocoop.org/url?a=commitdiff_plain;h=ff807a12adbf48c8e5d98cd9626c05999230321b;p=lhc%2Fweb%2Fwiklou.git

* Improve handling of ;: definition list construct with overlapping or nested HTML tags
---

diff --git a/RELEASE-NOTES b/RELEASE-NOTES
index 13872c83ce..792dda3c52 100644
--- a/RELEASE-NOTES
+++ b/RELEASE-NOTES
@@ -410,6 +410,9 @@ it from source control: http://www.mediawiki.org/wiki/Download_from_SVN
 * (bug 6164) Fix regression with resetting state
 * Hackaround for IE 7 wrapping bug in MonoBook footer
 * New message sp-newimages-showfrom replaces rclistfrom on special:newimages
+* Improve handling of ;: definition list construct with overlapping or
+  nested HTML tags
+
 
 == Compatibility ==
 
diff --git a/includes/Parser.php b/includes/Parser.php
index eb7a78df22..54b7e9b770 100644
--- a/includes/Parser.php
+++ b/includes/Parser.php
@@ -59,6 +59,16 @@ define( 'EXT_IMAGE_REGEX',
 	'('.EXT_IMAGE_FNAME_CLASS.'+)\\.((?i)'.EXT_IMAGE_EXTENSIONS.')$/S' # Filename
 );
 
+// State constants for the definition list colon extraction
+define( 'MW_COLON_STATE_TEXT', 0 );
+define( 'MW_COLON_STATE_TAG', 1 );
+define( 'MW_COLON_STATE_TAGSTART', 2 );
+define( 'MW_COLON_STATE_CLOSETAG', 3 );
+define( 'MW_COLON_STATE_TAGSLASH', 4 );
+define( 'MW_COLON_STATE_COMMENT', 5 );
+define( 'MW_COLON_STATE_COMMENTDASH', 6 );
+define( 'MW_COLON_STATE_COMMENTDASHDASH', 7 );
+
 /**
  * PHP Parser
  *
@@ -1963,43 +1973,145 @@ class Parser
 	}
 
 	/**
-	 * Split up a string on ':', ignoring any occurences inside
-	 * <a>..</a> or <span>...</span>
+	 * Split up a string on ':', ignoring any occurrences inside tags
+	 * to prevent illegal overlapping.
+	 * When false is returned, $before and $after are left untouched.
 	 * @param string $str the string to split
 	 * @param string &$before set to everything before the ':'
 	 * @param string &$after set to everything after the ':'
-	 * return string the position of the ':', or false if none found
+	 * @return int|false the position of the ':', or false if none found
 	 */
 	function findColonNoLinks($str, &$before, &$after) {
-		# I wonder if we should make this count all tags, not just <a>
-		# and <span>. That would prevent us from matching a ':' that
-		# comes in the middle of italics other such formatting....
-		# -- Wil
 		$fname = 'Parser::findColonNoLinks';
 		wfProfileIn( $fname );
-		$pos = 0;
-		do {
-			$colon = strpos($str, ':', $pos);
-
-			if ($colon !== false) {
-				$before = substr($str, 0, $colon);
-				$after = substr($str, $colon + 1);
-
-				# Skip any ':' within <a> or <span> pairs
-				$a = substr_count($before, '<a');
-				$s = substr_count($before, '<span');
-				$ca = substr_count($before, '</a>');
-				$cs = substr_count($before, '</span>');
-
-				if ($a <= $ca and $s <= $cs) {
-					# Tags are balanced before ':'; ok
+
+		$pos = strpos( $str, ':' );
+		if( $pos === false ) {
+			// Nothing to find!
+			wfProfileOut( $fname );
+			return false;
+		}
+
+		if( strpos( $str, '<' ) === false ) {
+			// Easy; no tag nesting to worry about
+			$before = substr( $str, 0, $pos );
+			$after = substr( $str, $pos+1 );
+			wfProfileOut( $fname );
+			return $pos;
+		}
+
+		// Ugly state machine to walk through avoiding tags.
+		$state = MW_COLON_STATE_TEXT;
+		$stack = 0;
+		$len = strlen( $str );
+		for( $i = 0; $i < $len; $i++ ) {
+			$c = $str[$i];
+
+			switch( $state ) {
+			// (Using the number is a performance hack for common cases)
+			case 0: // MW_COLON_STATE_TEXT:
+				switch( $c ) {
+				case "<":
+					// Could be either a <start> tag or an </end> tag
+					$state = MW_COLON_STATE_TAGSTART;
+					break;
+				case ":":
+					if( $stack == 0 ) {
+						// We found it!
+						$before = substr( $str, 0, $i );
+						$after = substr( $str, $i + 1 );
+						wfProfileOut( $fname );
+						return $i;
+					}
+					// Embedded in a tag; don't break it.
 					break;
+				default:
+					// ignore
 				}
-				$pos = $colon + 1;
+				break;
+			case 1: // MW_COLON_STATE_TAG:
+				// In a <tag>
+				switch( $c ) {
+				case ">":
+					$stack++;
+					$state = MW_COLON_STATE_TEXT;
+					break;
+				case "/":
+					// Slash may be followed by >?
+					$state = MW_COLON_STATE_TAGSLASH;
+					break;
+				default:
+					// ignore
+				}
+				break;
+			case 2: // MW_COLON_STATE_TAGSTART:
+				switch( $c ) {
+				case "/":
+					$state = MW_COLON_STATE_CLOSETAG;
+					break;
+				case "!":
+					$state = MW_COLON_STATE_COMMENT;
+					break;
+				case ">":
+					// Illegal early close? This shouldn't happen D:
+					$state = MW_COLON_STATE_TEXT;
+					break;
+				default:
+					$state = MW_COLON_STATE_TAG;
+				}
+				break;
+			case 3: // MW_COLON_STATE_CLOSETAG:
+				// In a </tag>
+				if( $c == ">" ) {
+					$stack--;
+					if( $stack < 0 ) {
+						wfDebug( "Invalid input in $fname; too many close tags\n" );
+						wfProfileOut( $fname );
+						return false;
+					}
+					$state = MW_COLON_STATE_TEXT;
+				}
+				break;
+			case MW_COLON_STATE_TAGSLASH:
+				if( $c == ">" ) {
+					// Yes, a self-closed tag <blah/>
+					$state = MW_COLON_STATE_TEXT;
+				} else {
+					// Probably we're jumping the gun, and this is an attribute
+					$state = MW_COLON_STATE_TAG;
+				}
+				break;
+			case 5: // MW_COLON_STATE_COMMENT:
+				if( $c == "-" ) {
+					$state = MW_COLON_STATE_COMMENTDASH;
+				}
+				break;
+			case MW_COLON_STATE_COMMENTDASH:
+				if( $c == "-" ) {
+					$state = MW_COLON_STATE_COMMENTDASHDASH;
+				} else {
+					$state = MW_COLON_STATE_COMMENT;
+				}
+				break;
+			case MW_COLON_STATE_COMMENTDASHDASH:
+				if( $c == ">" ) {
+					$state = MW_COLON_STATE_TEXT;
+				} elseif( $c != "-" ) {
+					// A further "-" keeps us in this state, so "--->" still ends the comment
+					$state = MW_COLON_STATE_COMMENT;
+				}
+				break;
+			default:
+				wfDebugDieBacktrace( "State machine error in $fname" );
 			}
-		} while ($colon !== false);
+		}
+		if( $stack > 0 ) {
+			wfDebug( "Invalid input in $fname; not enough close tags (stack $stack, state $state)\n" );
+			wfProfileOut( $fname );
+			return false;
+		}
 		wfProfileOut( $fname );
-		return $colon;
+		return false;
 	}
 
 	/**
diff --git a/maintenance/parserTests.txt b/maintenance/parserTests.txt
index 0a31ec6a04..4abf3a8cdc 100644
--- a/maintenance/parserTests.txt
+++ 
b/maintenance/parserTests.txt @@ -406,6 +406,27 @@ Definition lists: colon in external link text !! end +!! test +Definition lists: colon in HTML attribute +!! input +;bold +!! result +
bold +
+ +!! end + + +!! test +Definition lists: self-closed tag +!! input +;one
two : two-line fun +!! result +
one
two 
two-line fun +
+ +!! end + ### ### External links