* Switch XML type detection/validity checking from sniffing for XML processing instructions, doctypes, or subtags to actually parsing the file and checking the root element's name and namespace. This lets us properly handle SVG files which specify a namespace but no doctype, and reject files that aren't well-formed. (See http://meta.wikimedia.org/wiki/SVG_validity_checks for some samples of bad files I encountered.) Non-XML files will abort parsing pretty quickly, so this shouldn't be a big burden on other types that didn't hit a magic check.
* Fix the Unix script checks for Unicode-encoded files (er.... is that even right? :D), and remove the iconv dependency
'WikiErrorMsg' => 'includes/WikiError.php',
'WikiXmlError' => 'includes/WikiError.php',
'Xml' => 'includes/Xml.php',
+ 'XmlTypeCheck' => 'includes/XmlTypeCheck.php',
'ZhClient' => 'includes/ZhClient.php',
'memcached' => 'includes/memcached-client.php',
'EmaillingJob' => 'includes/JobQueue.php',
/*
* look for XML formats (XHTML and SVG)
*/
- $xml_type = NULL;
- if ( substr( $head, 0, 5 ) == "<?xml" ) {
- $xml_type = "ASCII";
- } elseif ( substr( $head, 0, 8 ) == "\xef\xbb\xbf<?xml") {
- $xml_type = "UTF-8";
- } elseif ( substr( $head, 0, 12 ) == "\xfe\xff\x00<\x00?\x00x\x00m\x00l" ) {
- $xml_type = "UTF-16BE";
- } elseif ( substr( $head, 0, 12 ) == "\xff\xfe<\x00?\x00x\x00m\x00l\x00") {
- $xml_type = "UTF-16LE";
- } else {
- /*
- echo "WARNING: Undetected xml_type ...\n";
- for( $i = 0; $i < 10; $i++ ) {
- $c = ord( $head{$i} );
- if( $c < 32 || $c > 126 ) {
- printf( "\\x%02x", $c );
- } else {
- print $head{$i};
- }
- }
- echo "\n";
- */
- }
-
- if( $xml_type == 'UTF-16BE' || $xml_type == 'UTF-16LE' ) {
- // Quick and dirty fold down to ASCII!
- $pack = array( 'UTF-16BE' => 'n*', 'UTF-16LE' => 'v*' );
- $chars = unpack( $pack[$xml_type], substr( $head, 2 ) );
- $head = '';
- foreach( $chars as $codepoint ) {
- if( $codepoint < 128 ) {
- $head .= chr( $codepoint );
- } else {
- $head .= '?';
- }
- }
- }
-
- $match = array();
- $doctype = "";
- $tag = "";
-
- if ( preg_match( '%<!DOCTYPE\s+[\w-]+\s+PUBLIC\s+["'."'".'"](.*?)["'."'".'"].*>%siD',
- $head, $match ) ) {
- $doctype = $match[1];
- }
-
- if( $xml_type || $doctype ) {
- if ( preg_match( '%<(\w+)\b%si', $head, $match ) ) {
- $tag = $match[1];
- }
-
- #print "<br>ANALYSING $file: doctype= $doctype; tag= $tag<br>";
-
- if ( strpos( $doctype, "-//W3C//DTD SVG" ) === 0 ) {
- return "image/svg+xml";
- } elseif ( $tag === "svg" ) {
- return "image/svg+xml";
- } elseif ( strpos( $doctype, "-//W3C//DTD XHTML" ) === 0 ) {
- return "text/html";
- } elseif ( $tag === "html" ) {
- return "text/html";
+ $xml = new XmlTypeCheck( $file );
+ if( $xml->wellFormed ) {
+ $types = array(
+ 'http://www.w3.org/2000/svg:svg' => 'image/svg+xml',
+ 'svg' => 'image/svg+xml',
+ 'http://www.w3.org/1999/xhtml:html' => 'text/html', // application/xhtml+xml?
+ 'html' => 'text/html', // application/xhtml+xml?
+ );
+ if( isset( $types[$xml->rootElement] ) ) {
+ $mime = $types[$xml->rootElement];
+ return $mime;
} else {
/// Fixme -- this would be the place to allow additional XML type checks
- return "application/xml";
+ return 'application/xml';
}
}
if ( $script_type ) {
if ( $script_type !== "UTF-8" && $script_type !== "ASCII") {
- $head = iconv( $script_type, "ASCII//IGNORE", $head);
+ // Quick and dirty fold down to ASCII!
+ $pack = array( 'UTF-16BE' => 'n*', 'UTF-16LE' => 'v*' );
+ $chars = unpack( $pack[$script_type], substr( $head, 2 ) );
+ $head = '';
+ foreach( $chars as $codepoint ) {
+ if( $codepoint < 128 ) {
+ $head .= chr( $codepoint );
+ } else {
+ $head .= '?';
+ }
+ }
}
$match = array();
--- /dev/null
+<?php
+
+/**
+ * Checks whether a file is well-formed XML and records the name of its
+ * root element, so callers can tell XML dialects (SVG, XHTML, ...) apart.
+ *
+ * The check runs once, from the constructor; results are exposed through
+ * the public $wellFormed and $rootElement properties.
+ */
+class XmlTypeCheck {
+	/**
+	 * Will be set to true or false to indicate whether the file is
+	 * well-formed XML. Note that this doesn't check schema validity.
+	 * Stays false when the file cannot be opened or read.
+	 */
+	public $wellFormed = false;
+
+	/**
+	 * Name of the document's root element, including any namespace
+	 * as an expanded URL, in the form "namespaceURL:localName"
+	 * (for example "http://www.w3.org/2000/svg:svg").
+	 *
+	 * This is filled in as soon as the first element is seen, so it may
+	 * be non-empty even when $wellFormed ends up false because of an
+	 * error later in the document.
+	 */
+	public $rootElement = '';
+
+	/** Whether undeclared namespace prefixes are tolerated (see constructor). */
+	private $softNamespaces;
+
+	/** Map of namespace prefix ('' for the default) => URL; soft mode only. */
+	private $namespaces = array();
+
+	/**
+	 * @param $file string filename
+	 * @param $softNamespaces bool
+	 *        If set to true, use of undeclared XML namespaces will be ignored.
+	 *        This matches the behavior of rsvg, but more compliant consumers
+	 *        such as Firefox will reject such files.
+	 *        Leave off for the default, stricter checks.
+	 */
+	public function __construct( $file, $softNamespaces=false ) {
+		$this->softNamespaces = $softNamespaces;
+		$this->run( $file );
+	}
+
+	/**
+	 * Parse the file in chunks and set $wellFormed / $rootElement.
+	 *
+	 * @param $fname string path of the file to check
+	 */
+	private function run( $fname ) {
+		$file = fopen( $fname, 'rb' );
+		if( $file === false ) {
+			// Unreadable file: there is nothing to parse. Bail out here
+			// instead of looping on an invalid handle; wellFormed stays false.
+			return;
+		}
+
+		if( $this->softNamespaces ) {
+			// Plain parser: prefixes are resolved by hand in elementOpen()
+			$parser = xml_parser_create( 'UTF-8' );
+		} else {
+			// Namespace-aware parser: element names arrive already expanded
+			$parser = xml_parser_create_ns( 'UTF-8' );
+		}
+
+		// case folding violates XML standard, turn it off
+		xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false );
+
+		xml_set_element_handler( $parser, array( $this, 'elementOpen' ), false );
+
+		$ok = true;
+		do {
+			$chunk = fread( $file, 32768 );
+			if( $chunk === false ) {
+				// Read error: treat as not well-formed rather than spinning
+				$ok = false;
+				break;
+			}
+			$atEnd = feof( $file );
+			// The final flag tells the parser to report truncated documents
+			$ok = ( xml_parse( $parser, $chunk, $atEnd ) === 1 );
+		} while( $ok && !$atEnd );
+
+		$this->wellFormed = $ok;
+
+		fclose( $file );
+		xml_parser_free( $parser );
+	}
+
+	/**
+	 * Start-element callback; records the root element's name and then
+	 * unregisters itself, since only the first open element is needed.
+	 *
+	 * @param $parser resource the XML parser invoking the callback
+	 * @param $name string element name as reported by the parser
+	 * @param $attribs array attribute name => value
+	 */
+	private function elementOpen( $parser, $name, $attribs ) {
+		if( $this->softNamespaces ) {
+			// Check namespaces manually, so expat doesn't throw
+			// errors on use of undeclared namespaces.
+			$prefixMarker = 'xmlns:';
+			$markerLen = strlen( $prefixMarker );
+			foreach( $attribs as $attrib => $val ) {
+				if( $attrib === 'xmlns' ) {
+					$this->namespaces[''] = $val;
+				} elseif( strncmp( $attrib, $prefixMarker, $markerLen ) === 0 ) {
+					$this->namespaces[substr( $attrib, $markerLen )] = $val;
+				}
+			}
+
+			if( strpos( $name, ':' ) === false ) {
+				$ns = '';
+				$subname = $name;
+			} else {
+				list( $ns, $subname ) = explode( ':', $name, 2 );
+			}
+
+			// An undeclared prefix is technically invalid for XML with
+			// Namespaces, but in soft mode the raw name is kept as-is.
+			if( isset( $this->namespaces[$ns] ) ) {
+				$name = $this->namespaces[$ns] . ':' . $subname;
+			}
+		}
+
+		// We only need the first open element
+		$this->rootElement = $name;
+		xml_set_element_handler( $parser, false, false );
+	}
+}