# -style tags. This should not be anything we
# may want to use in wikisyntax
define( 'STRIP_COMMENTS', 'HTMLCommentStrip' );

# Prefix for escaping, used in two functions at least
define( 'UNIQ_PREFIX', 'NaodW29');

# Constants needed for external link processing
define( 'URL_PROTOCOLS', 'http|https|ftp|irc|gopher|news|mailto' );
define( 'HTTP_PROTOCOLS', 'http|https' );
# Everything except bracket, space, or control characters
define( 'EXT_LINK_URL_CLASS', '[^]\\x00-\\x20\\x7F]' );
define( 'INVERSE_EXT_LINK_URL_CLASS', '[\]\\x00-\\x20\\x7F]' );
# Including space
define( 'EXT_LINK_TEXT_CLASS', '[^\]\\x00-\\x1F\\x7F]' );
define( 'EXT_IMAGE_FNAME_CLASS', '[A-Za-z0-9_.,~%\\-+&;#*?!=()@\\x80-\\xFF]' );
define( 'EXT_IMAGE_EXTENSIONS', 'gif|png|jpg|jpeg' );
# Bracketed external link: "[" URL, optional spaces, optional link text, "]"
define( 'EXT_LINK_BRACKETED', '/\[(('.URL_PROTOCOLS.'):'.EXT_LINK_URL_CLASS.'+) *('.EXT_LINK_TEXT_CLASS.'*?)\]/S' );
# Bare URL of an external image. The protocol alternation is wrapped in a
# non-capturing group so that the ':' applies to every protocol; without it
# the pattern reads "(http|https:)" and accepts any string that merely starts
# with "http". Capture group numbering is unchanged (1 = protocol with colon,
# 2 = host and path, 3 = file name, 4 = extension).
define( 'EXT_IMAGE_REGEX',
	'/^((?:'.HTTP_PROTOCOLS.'):)'.  # Protocol
	'('.EXT_LINK_URL_CLASS.'+)\\/'.  # Hostname and path
	'('.EXT_IMAGE_FNAME_CLASS.'+)\\.((?i)'.EXT_IMAGE_EXTENSIONS.')$/S' # Filename
);

/**
 * PHP Parser
 *
 * Processes wiki markup
 *
 *
* There are three main entry points into the Parser class:
 * parse()
 *   produces HTML output
 * preSaveTransform().
 *   produces altered wiki markup.
 * transformMsg()
 *   performs brace substitution on MediaWiki messages
 *
 * Globals used:
 *    objects:   $wgLang, $wgDateFormatter, $wgLinkCache, $wgCurParser
 *
 * NOT $wgArticle, $wgUser or $wgTitle. Keep them away!
 *
 * settings:
 *  $wgUseTex*, $wgUseDynamicDates*, $wgInterwikiMagic*,
 *  $wgNamespacesWithSubpages, $wgAllowExternalImages*,
 *  $wgLocaltimezone
 *
 *  * only within ParserOptions
 *
 * @package MediaWiki
 */
class Parser {
	/**#@+
	 * @access private
	 */
	# Persistent:
	var $mTagHooks;

	# Cleared with clearState():
	var $mOutput, $mAutonumber, $mDTopen, $mStripState = array();
	var $mVariables, $mIncludeCount, $mArgStack, $mLastSection, $mInPre;

	# Temporary:
	var $mOptions, $mTitle, $mOutputType,
	    $mTemplates,	// cache of already loaded templates, avoids
	                	// multiple SQL queries for the same string
	    $mTemplatePath;	// stores an unsorted hash of all the templates already loaded
	                	// in this path. Used for loop detection.
	/**#@-*/

	/**
	 * PHP 4 style constructor.
	 *
	 * Kept so that PHP 4 (which only knows class-named constructors) and any
	 * code calling Parser() explicitly keep working. It is declared before
	 * __construct() and simply delegates to it.
	 *
	 * @access public
	 */
	function Parser() {
		$this->__construct();
	}

	/**
	 * Constructor
	 *
	 * Initialises the template caches and the tag hook table, then resets
	 * the per-parse state via clearState(). PHP 8 no longer treats a method
	 * named after the class as a constructor, so the initialisation lives
	 * here and Parser() forwards to it.
	 *
	 * @access public
	 */
	function __construct() {
		$this->mTemplates = array();
		$this->mTemplatePath = array();
		$this->mTagHooks = array();
		$this->clearState();
	}

	/**
	 * Clear Parser state
	 *
	 * Replaces the output object with a fresh ParserOutput and resets every
	 * member listed under "Cleared with clearState()" to its empty value.
	 *
	 * @access private
	 */
	function clearState() {
		$this->mOutput = new ParserOutput;
		$this->mAutonumber = 0;
		$this->mLastSection = "";
		$this->mDTopen = false;
		$this->mVariables = false;
		$this->mIncludeCount = array();
		$this->mStripState = array();
		$this->mArgStack = array();
		$this->mInPre = false;
	}

	/** * First pass--just handle
' . wfEscapeHTMLTagsOnly( $content ) . ''; } else { $pre_content[$marker] = '
'.$content.''; } } # Comments if($stripcomments) { $text = Parser::extractTags(STRIP_COMMENTS, $text, $comment_content, $uniq_prefix); foreach( $comment_content as $marker => $content ){ $comment_content[$marker] = ''; } } # Extensions foreach ( $this->mTagHooks as $tag => $callback ) { $ext_contents[$tag] = array(); $text = Parser::extractTags( $tag, $text, $ext_content[$tag], $uniq_prefix ); foreach( $ext_content[$tag] as $marker => $content ) { if ( $render ) { $ext_content[$tag][$marker] = $callback( $content ); } else { $ext_content[$tag][$marker] = "<$tag>$content$tag>"; } } } # Merge state with the pre-existing state, if there is one if ( $state ) { $state['html'] = $state['html'] + $html_content; $state['nowiki'] = $state['nowiki'] + $nowiki_content; $state['math'] = $state['math'] + $math_content; $state['pre'] = $state['pre'] + $pre_content; $state['comment'] = $state['comment'] + $comment_content; foreach( $ext_content as $tag => $array ) { if ( array_key_exists( $tag, $state ) ) { $state[$tag] = $state[$tag] + $array; } } } else { $state = array( 'html' => $html_content, 'nowiki' => $nowiki_content, 'math' => $math_content, 'pre' => $pre_content, 'comment' => $comment_content, ) + $ext_content; } return $text; }

	/**
	 * Restores the sections removed by strip(): every entry of the strip
	 * state except the 'nowiki' and 'html' ones has its marker replaced by
	 * the stored content.
	 *
	 * Always call unstripNoWiki() after this one.
	 *
	 * @param string $text  text containing strip markers
	 * @param array  $state strip state (section name => array( marker => content ))
	 * @return string text with the markers expanded
	 * @access private
	 */
	function unstrip( $text, &$state ) {
		# Nothing was ever stripped (state may be null/false before first use)
		if ( !is_array( $state ) ) {
			return $text;
		}

		# Must expand in reverse order, otherwise nested tags will be corrupted
		foreach ( array_reverse( $state, true ) as $section => $contentDict ) {
			if ( $section === 'nowiki' || $section === 'html' || !is_array( $contentDict ) ) {
				continue;
			}
			foreach ( array_reverse( $contentDict, true ) as $marker => $content ) {
				$text = str_replace( $marker, $content, $text );
			}
		}
		return $text;
	}

	/**
	 * Expands the 'nowiki' markers and, when $wgRawHtml is enabled, the
	 * 'html' markers as well.
	 *
	 * Always call this after unstrip() to preserve the order.
	 *
	 * @param string $text  text containing strip markers
	 * @param array  $state strip state; missing 'nowiki'/'html' entries are
	 *                      treated as empty and the state is left untouched
	 * @return string text with the markers expanded
	 * @access private
	 */
	function unstripNoWiki( $text, &$state ) {
		global $wgRawHtml;

		# Must expand in reverse order, otherwise nested tags will be corrupted
		if ( isset( $state['nowiki'] ) && is_array( $state['nowiki'] ) ) {
			foreach ( array_reverse( $state['nowiki'], true ) as $marker => $content ) {
				$text = str_replace( $marker, $content, $text );
			}
		}

		if ( $wgRawHtml && isset( $state['html'] ) && is_array( $state['html'] ) ) {
			foreach ( array_reverse( $state['html'], true ) as $marker => $content ) {
				$text = str_replace( $marker, $content, $text );
			}
		}

		return $text;
	}

	/**
	 * Add an item to the strip state
	 * Returns the unique tag which must be inserted into the stripped text
	 * The tag will be replaced with the original text in unstrip()
	 *
	 * @param string $text  content to hide behind a marker
	 * @param array  $state strip state; created here when empty
	 * @return string the generated marker
	 * @access private
	 */
	function insertStripItem( $text, &$state ) {
		$rnd = UNIQ_PREFIX . '-item' . Parser::getRandomString();
		if ( !$state ) {
			# Seed every section that strip() merges into unconditionally when
			# it is handed a non-empty state; 'comment' is one of them.
			$state = array(
				'html' => array(),
				'nowiki' => array(),
				'math' => array(),
				'pre' => array(),
				'comment' => array()
			);
		}
		$state['item'][$rnd] = $text;
		return $rnd;
	}

	/**
	 * Return allowed HTML attributes
	 *
	 * @return array list of lower-case attribute names
	 * @access private
	 */
	function getHTMLattrs () {
		$htmlattrs = array( # Allowed attributes--no scripting, etc.
			'title', 'align', 'lang', 'dir', 'width', 'height',
			'bgcolor', 'clear', /* BR */ 'noshade', /* HR */
			'cite', /* BLOCKQUOTE, Q */ 'size', 'face', 'color',
			/* FONT */ 'type', 'start', 'value', 'compact',
			/* For various lists, mostly deprecated but safe */
			'summary', 'width', 'border', 'frame', 'rules',
			'cellspacing', 'cellpadding', 'valign', 'char',
			'charoff', 'colgroup', 'col', 'span', 'abbr', 'axis',
			'headers', 'scope', 'rowspan', 'colspan', /* Tables */
			'id', 'class', 'name', 'style' /* For CSS */
		);
		return $htmlattrs ;
	}

	/**
	 * Remove non approved attributes and javascript in css
	 *
	 * @param string $t attribute part of an HTML tag
	 * @return string the attributes that are allowed, trimmed; '' when
	 *                nothing is left or the style attribute looks dangerous
	 * @access private
	 */
	function fixTagAttributes ( $t ) {
		if ( trim ( $t ) == '' ) return '' ; # Saves runtime ;-)

		# Strip non-approved attributes from the tag.
		# A callback is used instead of the /e modifier: /e evaluates the
		# attribute value inside a double-quoted string (so '$' and '{${..}}'
		# in user input get interpreted), mangles single quotes, and is not
		# available in PHP 7 and later.
		$t = preg_replace_callback(
			'/(\\w+)(\\s*=\\s*([^\\s\">]+|\"[^\">]*\"))?/',
			array( $this, 'fixTagAttributesCallback' ),
			$t
		);
		# NOTE(review): the literal below may have been truncated upstream of
		# this file (an empty open/close tag pair seems intended) -- confirm.
		$t = str_replace ( '<>>' , '' , $t ) ; # This should fix bug 980557

		# Strip javascript "expression" from stylesheets. Brute force approach:
		# If anything offensive is found, all attributes of the HTML tag are dropped
		if( preg_match(
			'/style\\s*=.*(expression|tps*:\/\/|url\\s*\().*/is',
			wfMungeToUtf8( $t ) ) )
		{
			$t='';
		}

		return trim ( $t ) ;
	}

	/**
	 * preg_replace_callback() helper for fixTagAttributes().
	 *
	 * @param array $matches 1 = attribute name, 3 = attribute value
	 *                       (absent when the attribute has no value)
	 * @return string "name", "name=value", or '' for a non-approved attribute
	 * @access private
	 */
	function fixTagAttributesCallback( $matches ) {
		$name = $matches[1];
		if ( !in_array( strtolower( $name ), $this->getHTMLattrs() ) ) {
			return '';
		}
		$value = isset( $matches[3] ) ? $matches[3] : '';
		if ( $value === '' ) {
			return $name;
		}
		return $name . '=' . $value;
	}

	/** * interface with html tidy, used if $wgUseTidy = true * * @access public * @static */ function tidy ( $text ) { global $wgTidyConf, $wgTidyBin, $wgTidyOpts; global $wgInputEncoding, $wgOutputEncoding; $fname = 'Parser::tidy'; wfProfileIn( $fname ); $cleansource = ''; $opts = ''; switch(strtoupper($wgOutputEncoding)) { case 'ISO-8859-1': $opts .= ($wgInputEncoding == $wgOutputEncoding)? ' -latin1':' -raw'; break; case 'UTF-8': $opts .= ($wgInputEncoding == $wgOutputEncoding)? ' -utf8':' -raw'; break; default: $opts .= ' -raw'; } $wrappedtext = ''. '
mInPre ) { # Multiple prefixes may abut each other for nested lists. $prefixLength = strspn( $oLine, '*#:;' ); $pref = substr( $oLine, 0, $prefixLength ); # eh? $pref2 = str_replace( ';', ':', $pref ); $t = substr( $oLine, $prefixLength ); $this->mInPre = !empty($preOpenMatch); } else { # Don't interpret any other prefixes in preformatted text $prefixLength = 0; $pref = $pref2 = ''; $t = $oLine; } # List generation if( $prefixLength && 0 == strcmp( $lastPrefix, $pref2 ) ) { # Same as the last item, so no need to deal with nesting or opening stuff $output .= $this->nextItem( substr( $pref, -1 ) ); $paragraphStack = false; if ( substr( $pref, -1 ) == ';') { # The one nasty exception: definition lists work like this: # ; title : definition text # So we check for : in the remainder text to split up the # title and definition, without b0rking links. if ($this->findColonNoLinks($t, $term, $t2) !== false) { $t = $t2; $output .= $term . $this->nextItem( ':' ); } } } elseif( $prefixLength || $lastPrefixLength ) { # Either open or close a level... $commonPrefixLength = $this->getCommon( $pref, $lastPrefix ); $paragraphStack = false; while( $commonPrefixLength < $lastPrefixLength ) { $output .= $this->closeList( $lastPrefix{$lastPrefixLength-1} ); --$lastPrefixLength; } if ( $prefixLength <= $commonPrefixLength && $commonPrefixLength > 0 ) { $output .= $this->nextItem( $pref{$commonPrefixLength-1} ); } while ( $prefixLength > $commonPrefixLength ) { $char = substr( $pref, $commonPrefixLength, 1 ); $output .= $this->openList( $char ); if ( ';' == $char ) { # FIXME: This is dupe of code above if ($this->findColonNoLinks($t, $term, $t2) !== false) { $t = $t2; $output .= $term . $this->nextItem( ':' ); } } ++$commonPrefixLength; } $lastPrefix = $pref2; } if( 0 == $prefixLength ) { # No prefix (not in list)--go to paragraph mode $uniq_prefix = UNIQ_PREFIX; // XXX: use a stack for nestable elements like span, table and div $openmatch = preg_match('/(