(X)HTML parser
 * Based on work by Jan Hidders and Magnus Manske
 * @package MediaWiki
 * @subpackage Experimental
 */

/**
 * the base class for an element
 *
 * One node of the tree built from the XML input: it carries the element
 * name, its attribute array and an ordered list of children. A child is
 * either a plain string (character data) or another element object.
 */
class element {
	var $name = '';
	var $attrs = array();
	var $children = array();

	/**
	 * Render all children, optionally wrapped in an XHTML tag.
	 *
	 * String children are appended as-is; element children are rendered
	 * through their own makeXHTML().
	 *
	 * NOTE(review): string children are emitted without HTML escaping.
	 * makeInternalLink() reads link targets back out of this output, so
	 * escaping cannot simply be added here - confirm where it belongs.
	 *
	 * @param object $parser the calling parser, handed through to children
	 * @param string $tag    tag name to wrap the children in, "" for none
	 * @param string $attr   raw attribute text for the opening tag, "" for none
	 * @return string the rendered markup
	 */
	function sub_makeXHTML ( &$parser , $tag = "" , $attr = "" ) {
		$ret = "" ;
		if ( $tag != "" ) {
			$ret .= "<" . $tag ;
			if ( $attr != "" ) $ret .= " " . $attr ;
			$ret .= ">" ;
		}
		foreach ( $this->children as $child ) {
			if ( is_string ( $child ) ) {
				$ret .= $child ;
			} else {
				$ret .= $child->makeXHTML ( $parser ) ;
			}
		}
		# Close what was opened above; without this every wrapped element
		# (p, strong, li, table, ...) would be left unclosed.
		if ( $tag != "" ) $ret .= "</" . $tag . ">\n" ;
		return $ret ;
	}

	/**
	 * Build the markup for an internal link.
	 *
	 * Currently a stub: it only returns one of three placeholder strings.
	 * Splitting the target into language, namespace and sub-target is not
	 * implemented, so $language and $namespace stay empty and the
	 * "{internal link}" placeholder is always returned.
	 *
	 * @param object $parser        the calling parser (unused so far)
	 * @param string $target        the link target text
	 * @param string $display_title the text to show for the link (unused so far)
	 * @param array  $options       the remaining link options (unused so far)
	 * @return string a placeholder string
	 */
	function createInternalLink ( &$parser , $target , $display_title , $options ) {
		$tp = explode ( ":" , $target ) ; # tp = target parts
		$title = "" ;     # The plain title
		$language = "" ;  # The language/meta/etc. part
		$namespace = "" ; # The namespace, if any
		$subtarget = "" ; # The '#' thingy
		if ( count ( $tp ) == 1 ) $title = $target ; # Plain and simple case
		else {
			# To be implemented
		}

		if ( $language != "" ) # External link within the WikiMedia project
		{
			return "{language link}" ;
		}
		else if ( $namespace != "" ) # Link to another namespace, check for image/media stuff
		{
			return "{namespace link}" ;
		}
		else
		{
			return "{internal link}" ;
		}
	}

	/**
	 * Render a LINK element.
	 *
	 * The LINKTARGET child supplies the target; every other element child
	 * is collected as an option. The last option is used as the display
	 * title; when there are no options the target doubles as the title.
	 *
	 * @param object $parser the calling parser
	 * @return string whatever createInternalLink() returns
	 */
	function makeInternalLink ( &$parser ) {
		$target = "" ;
		$option = array () ;
		foreach ( $this->children as $child ) {
			if ( is_string ( $child ) ) {
				# This shouldn't be the case!
			} else {
				if ( $child->name == "LINKTARGET" )
					$target = trim ( $child->makeXHTML ( $parser ) ) ;
				else
					$option[] = trim ( $child->makeXHTML ( $parser ) ) ;
			}
		}

		if ( count ( $option ) == 0 ) $option[] = $target ; # Create dummy display title
		$display_title = array_pop ( $option ) ;
		return $this->createInternalLink ( $parser , $target , $display_title , $option ) ;
	}

	/**
	 * Render this element and everything below it.
	 *
	 * Dispatches on the element name. Known names map to an XHTML tag or
	 * to a dedicated renderer; unknown names are echoed back as a tag of
	 * the same name. The result is wrapped in newlines and each "\n\n"
	 * pair is then replaced by a single "\n" (one str_replace pass).
	 *
	 * @param object $parser the calling parser
	 * @return string the rendered markup
	 */
	function makeXHTML ( &$parser ) {
		$ret = "" ;
		$n = $this->name ; # Shortcut

		switch ( $n ) {
			case "ARTICLE" :
			case "LINKTARGET" :
			case "LINKOPTION" :
				$ret .= $this->sub_makeXHTML ( $parser ) ;
				break ;
			case "HEADING" :
				# isset() keeps a missing LEVEL from raising a notice;
				# the resulting tag name is the same as before ("h").
				$level = isset ( $this->attrs["LEVEL"] ) ? $this->attrs["LEVEL"] : "" ;
				$ret .= $this->sub_makeXHTML ( $parser , "h" . $level ) ;
				break ;
			case "PARAGRAPH" :
				$ret .= $this->sub_makeXHTML ( $parser , "p" ) ;
				break ;
			case "BOLD" :
				$ret .= $this->sub_makeXHTML ( $parser , "strong" ) ;
				break ;
			case "ITALICS" :
				$ret .= $this->sub_makeXHTML ( $parser , "em" ) ;
				break ;
			case "LINK" :
				$ret .= $this->makeInternalLink ( $parser ) ;
				break ;
			case "EXTENSION" : # This is currently a dummy!!!
				$ext = isset ( $this->attrs["NAME"] ) ? $this->attrs["NAME"] : "" ;
				$ret .= "<" . $ext . ">" ;
				$ret .= $this->sub_makeXHTML ( $parser ) ;
				$ret .= "</" . $ext . "> " ;
				break ;
			case "TABLE" :
				$ret .= $this->sub_makeXHTML ( $parser , "table" ) ;
				break ;
			case "TABLEROW" :
				$ret .= $this->sub_makeXHTML ( $parser , "tr" ) ;
				break ;
			case "TABLECELL" :
				$ret .= $this->sub_makeXHTML ( $parser , "td" ) ;
				break ;
			case "LISTITEM" :
				$ret .= $this->sub_makeXHTML ( $parser , "li" ) ;
				break ;
			case "LIST" :
				$type = "ol" ; # Default
				if ( isset ( $this->attrs["TYPE"] ) && $this->attrs["TYPE"] == "bullet" ) $type = "ul" ;
				$ret .= $this->sub_makeXHTML ( $parser , $type ) ;
				break ;
			default :
				# Unknown element: pass it through under its own name
				$ret .= "<" . $n . ">" ;
				$ret .= $this->sub_makeXHTML ( $parser ) ;
				$ret .= "</" . $n . "> " ;
				break ;
		}

		$ret = "\n{$ret}\n" ;
		$ret = str_replace ( "\n\n" , "\n" , $ret ) ;
		return $ret ;
	}

	/**
	 * Debug representation of this element.
	 *
	 * @return string as written here, always a single newline
	 */
	function myPrint() {
		$ret = "\n";
		return $ret;
	}
}

$ancStack = array(); // the stack with ancestral elements

// Three global functions needed for parsing, sorry guys

/**
 * Start-tag handler: create a new element and push it on the ancestor stack.
 *
 * @param mixed  $parser the XML parser invoking the handler
 * @param string $name   the element name
 * @param array  $attrs  the element attributes
 */
function wgXMLstartElement($parser, $name, $attrs) {
	global $ancStack;

	$newElem = new element;
	$newElem->name = $name;
	$newElem->attrs = $attrs;

	array_push($ancStack, $newElem);
}

/**
 * End-tag handler: pop the finished element off the ancestor stack and
 * attach it to its parent, or store it in $rootElem when it was the last
 * element on the stack.
 *
 * @param mixed  $parser the XML parser invoking the handler
 * @param string $name   the element name (unused)
 */
function wgXMLendElement($parser, $name) {
	global $ancStack, $rootElem;
	// pop element off stack
	$elem = array_pop ($ancStack);
	if (count ($ancStack) == 0)
		$rootElem = $elem;
	else
		// add it to its parent
		array_push ($ancStack[count($ancStack)-1]->children, $elem);
}

/**
 * Character-data handler: append non-blank text to the element currently
 * on top of the ancestor stack.
 *
 * NOTE(review): the data is trimmed per call. If the XML extension hands
 * over one text node in several pieces (e.g. around an entity), the
 * spaces between the pieces are lost - confirm whether that matters here.
 *
 * @param mixed  $parser the XML parser invoking the handler
 * @param string $data   the character data
 */
function wgXMLcharacterData($parser, $data) {
	global $ancStack;
	$data = trim ($data); // Don't add blank lines, they're no use...
	// add to parent if parent exists
	if ( $ancStack && $data !== "" ) {
		array_push ($ancStack[count($ancStack)-1]->children, $data);
	}
}


/**
 * Here's the class that generates a nice tree
 *
 * Both scan methods feed XML through the three wgXML* handlers above and
 * return the root element that wgXMLendElement() stores in the global
 * $rootElem. Any XML error terminates the script via die().
 */
class xml2php {

	/**
	 * Parse an XML file into an element tree.
	 *
	 * @param string $filename path of the XML file to read
	 * @return element|null the root element; null if no root element was closed
	 */
	function &scanFile( $filename ) {
		global $ancStack, $rootElem;
		$ancStack = array();
		// forget the tree of any earlier scan so it cannot be returned by mistake
		$rootElem = null;

		$xml_parser = xml_parser_create();
		xml_set_element_handler ($xml_parser, 'wgXMLstartElement', 'wgXMLendElement');
		xml_set_character_data_handler ($xml_parser, 'wgXMLcharacterData');
		if (!($fp = fopen($filename, 'r'))) {
			die('could not open XML input');
		}
		// Explicit comparisons: a chunk consisting of just "0" is falsy and
		// would otherwise end the loop before the input is exhausted.
		while (($data = fread($fp, 4096)) !== false && $data !== '') {
			if (!xml_parse($xml_parser, $data, feof($fp))) {
				die(sprintf("XML error: %s at line %d",
					xml_error_string(xml_get_error_code($xml_parser)),
					xml_get_current_line_number($xml_parser)));
			}
		}
		fclose($fp);
		xml_parser_free($xml_parser);

		// return the remaining root element we copied in the beginning
		return $rootElem;
	}

	/**
	 * Parse an XML string into an element tree.
	 *
	 * @param string $input the complete XML document
	 * @return element|null the root element; null if no root element was closed
	 */
	function scanString ( $input ) {
		global $ancStack, $rootElem;
		$ancStack = array();
		// forget the tree of any earlier scan so it cannot be returned by mistake
		$rootElem = null;

		$xml_parser = xml_parser_create();
		xml_set_element_handler ($xml_parser, 'wgXMLstartElement', 'wgXMLendElement');
		xml_set_character_data_handler ($xml_parser, 'wgXMLcharacterData');

		if (!xml_parse ($xml_parser, $input, true)) {
			die (sprintf ("XML error: %s at line %d",
				xml_error_string(xml_get_error_code($xml_parser)),
				xml_get_current_line_number($xml_parser)));
		}
		xml_parser_free ($xml_parser);

		// return the remaining root element we copied in the beginning
		return $rootElem;
	}

}

/* Example code:

	$w = new xml2php;
	$filename = 'sample.xml';
	$result = $w->scanFile( $filename );
	print $result->myPrint();
*/

# Sample input used by ParserXML::parse() in place of the text it is given.
# NOTE(review): as written this string contains no markup - confirm that
# this is the intended sample.
$dummytext = "
R-type image:a.jpg123textThe video gamecomputer game R-type is cool & stuff because:it's niceit's fastit has:graphicssoundVersion 1 not badVersion 2 much better
This is a || token in the middle of text.
" ;


class ParserXML EXTENDS Parser
{
	/**#@+
	 * @access private
	 */
	# Persistent:
	var $mTagHooks;

	# Cleared with clearState():
	var $mOutput, $mAutonumber, $mDTopen, $mStripState = array();
	var $mVariables, $mIncludeCount, $mArgStack, $mLastSection, $mInPre;

	# Temporary:
	var $mOptions, $mTitle, $mOutputType,
		$mTemplates,    // cache of already loaded templates, avoids
		                // multiple SQL queries for the same string
		$mTemplatePath; // stores an unsorted hash of all the templates already loaded
		                // in this path. Used for loop detection.

	/**#@-*/

	/**
	 * Constructor
	 *
	 * NOTE(review): this is a PHP 4 style constructor (a method named
	 * after the class); PHP 8 no longer calls such methods on "new".
	 *
	 * @access public
	 */
	function ParserXML() {
		$this->mTemplates = array();
		$this->mTemplatePath = array();
		$this->mTagHooks = array();
		$this->clearState();
	}

	/**
	 * Clear Parser state
	 *
	 * @access private
	 */
	function clearState() {
		$this->mOutput = new ParserOutput;
		$this->mAutonumber = 0;
		$this->mLastSection = "";
		$this->mDTopen = false;
		$this->mVariables = false;
		$this->mIncludeCount = array();
		$this->mStripState = array();
		$this->mArgStack = array();
		$this->mInPre = false;
	}

	/**
	 * Parse text into a ParserOutput.
	 *
	 * For now the incoming $text is discarded and the global $dummytext is
	 * parsed instead. The output text is the dummy input followed by the
	 * rendered XHTML and the myPrint() dump of the tree. $title, $options,
	 * $linestart and $clearState are not used.
	 *
	 * @param string $text       text to parse (currently ignored)
	 * @param object $title      title of the page (unused)
	 * @param object $options    parser options (unused)
	 * @param bool   $linestart  unused
	 * @param bool   $clearState unused
	 * @return ParserOutput the output object with its text set
	 */
	function parse( $text, &$title, $options, $linestart = true, $clearState = true ) {
		global $dummytext ;
		$text = $dummytext ;

		$w = new xml2php;
		$result = $w->scanString( $text );
		$text .= "
" . $result->makeXHTML ( $this );
		$text .= "
" . $result->myPrint();

		$this->mOutput->setText ( $text ) ;
		return $this->mOutput;
	}

}

?>