From fee0f038b8d8c9cd14da7de772116388b1ada59e Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Wed, 6 Feb 2008 01:23:12 +0000 Subject: [PATCH] More file type checks... * Switch XML type detection/validity check from dipping for XML processing instructions, doctypes, or subtags to just trying to parse it and checking the root element's name and namespace. This lets us properly handle SVG files which specify a namespace but no doctype, as well as rejecting files that aren't well-formed. (See http://meta.wikimedia.org/wiki/SVG_validity_checks for some samples of bad files I encountered.) Non-XML files will abort parsing pretty quickly, so this shouldn't be a big burden on other types that didn't hit a magic check. * Fix Unicode unix script checks (er.... is that even right? :D), remove the iconv dependency --- includes/AutoLoader.php | 1 + includes/MimeMagic.php | 87 ++++++++++-------------------------- includes/XmlTypeCheck.php | 93 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 117 insertions(+), 64 deletions(-) create mode 100644 includes/XmlTypeCheck.php diff --git a/includes/AutoLoader.php b/includes/AutoLoader.php index 34c6704c5c..2e2083b2a2 100644 --- a/includes/AutoLoader.php +++ b/includes/AutoLoader.php @@ -271,6 +271,7 @@ function __autoload($className) { 'WikiErrorMsg' => 'includes/WikiError.php', 'WikiXmlError' => 'includes/WikiError.php', 'Xml' => 'includes/Xml.php', + 'XmlTypeCheck' => 'includes/XmlTypeCheck.php', 'ZhClient' => 'includes/ZhClient.php', 'memcached' => 'includes/memcached-client.php', 'EmaillingJob' => 'includes/JobQueue.php', diff --git a/includes/MimeMagic.php b/includes/MimeMagic.php index 77a306214a..75cfb6e7bb 100644 --- a/includes/MimeMagic.php +++ b/includes/MimeMagic.php @@ -455,71 +455,20 @@ class MimeMagic { /* * look for XML formats (XHTML and SVG) */ - $xml_type = NULL; - if ( substr( $head, 0, 5 ) == " 126 ) { - printf( "\\x%02x", $c ); - } else { - print $head{$i}; - } - } - echo "\n"; - */ - } - - if( $xml_type == 'UTF-16BE' || $xml_type == 'UTF-16LE' ) { - // Quick and dirty fold down to ASCII! - $pack = array( 'UTF-16BE' => 'n*', 'UTF-16LE' => 'v*' ); - $chars = unpack( $pack[$xml_type], substr( $head, 2 ) ); - $head = ''; - foreach( $chars as $codepoint ) { - if( $codepoint < 128 ) { - $head .= chr( $codepoint ); - } else { - $head .= '?'; - } - } - } - - $match = array(); - $doctype = ""; - $tag = ""; - - if ( preg_match( '%%siD', - $head, $match ) ) { - $doctype = $match[1]; - } - - if( $xml_type || $doctype ) { - if ( preg_match( '%<(\w+)\b%si', $head, $match ) ) { - $tag = $match[1]; - } - - #print "
ANALYSING $file: doctype= $doctype; tag= $tag
"; - - if ( strpos( $doctype, "-//W3C//DTD SVG" ) === 0 ) { - return "image/svg+xml"; - } elseif ( $tag === "svg" ) { - return "image/svg+xml"; - } elseif ( strpos( $doctype, "-//W3C//DTD XHTML" ) === 0 ) { - return "text/html"; - } elseif ( $tag === "html" ) { - return "text/html"; + $xml = new XmlTypeCheck( $file ); + if( $xml->wellFormed ) { + $types = array( + 'http://www.w3.org/2000/svg:svg' => 'image/svg+xml', + 'svg' => 'image/svg+xml', + 'http://www.w3.org/1999/xhtml:html' => 'text/html', // application/xhtml+xml? + 'html' => 'text/html', // application/xhtml+xml? + ); + if( isset( $types[$xml->rootElement] ) ) { + $mime = $types[$xml->rootElement]; + return $mime; } else { /// Fixme -- this would be the place to allow additional XML type checks - return "application/xml"; + return 'application/xml'; } } @@ -541,7 +490,17 @@ class MimeMagic { if ( $script_type ) { if ( $script_type !== "UTF-8" && $script_type !== "ASCII") { - $head = iconv( $script_type, "ASCII//IGNORE", $head); + // Quick and dirty fold down to ASCII! + $pack = array( 'UTF-16BE' => 'n*', 'UTF-16LE' => 'v*' ); + $chars = unpack( $pack[$script_type], substr( $head, 2 ) ); + $head = ''; + foreach( $chars as $codepoint ) { + if( $codepoint < 128 ) { + $head .= chr( $codepoint ); + } else { + $head .= '?'; + } + } } $match = array(); diff --git a/includes/XmlTypeCheck.php b/includes/XmlTypeCheck.php new file mode 100644 index 0000000000..639d1f8531 --- /dev/null +++ b/includes/XmlTypeCheck.php @@ -0,0 +1,93 @@ +softNamespaces = $softNamespaces; + $this->run( $file ); + } + + private function run( $fname ) { + if( $this->softNamespaces ) { + $parser = xml_parser_create( 'UTF-8' ); + } else { + $parser = xml_parser_create_ns( 'UTF-8' ); + } + + // case folding violates XML standard, turn it off + xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false ); + + xml_set_element_handler( $parser, array( $this, 'elementOpen' ), false ); + + $file = fopen( $fname, "rb" ); + do { + $chunk = fread( $file, 32768 ); + $ret = xml_parse( $parser, $chunk, feof( $file ) ); + if( $ret == 0 ) { + // XML isn't well-formed! + fclose( $file ); + xml_parser_free( $parser ); + return; + } + } while( !feof( $file ) ); + + $this->wellFormed = true; + + fclose( $file ); + xml_parser_free( $parser ); + } + + private function elementOpen( $parser, $name, $attribs ) { + if( $this->softNamespaces ) { + // Check namespaces manually, so expat doesn't throw + // errors on use of undeclared namespaces. + foreach( $attribs as $attrib => $val ) { + if( $attrib == 'xmlns' ) { + $this->namespaces[''] = $val; + } elseif( substr( $attrib, 0, strlen( 'xmlns:' ) ) == 'xmlns:' ) { + $this->namespaces[substr( $attrib, strlen( 'xmlns:' ) )] = $val; + } + } + + if( strpos( $name, ':' ) === false ) { + $ns = ''; + $subname = $name; + } else { + list( $ns, $subname ) = explode( ':', $name, 2 ); + } + + if( isset( $this->namespaces[$ns] ) ) { + $name = $this->namespaces[$ns] . ':' . $subname; + } else { + // Technically this is invalid for XML with Namespaces. + // But..... we'll just let it slide in soft mode. + } + } + + // We only need the first open element + $this->rootElement = $name; + xml_set_element_handler( $parser, false, false ); + } +} -- 2.20.1