*/
private $parserOptions = [
'processing_instruction_handler' => '',
+ 'external_dtd_handler' => '',
+ 'dtd_handler' => '',
+ 'require_safe_dtd' => true
];
/**
+ * Allow filtering an XML file.
+ *
+ * Filters should return either true or a string to indicate something
+ * is wrong with the file. $this->filterMatch will store if the
+ * file failed validation (true = failed validation).
+ * $this->filterMatchType will contain the validation error.
+ * $this->wellFormed will contain whether the xml file is well-formed.
+ *
+ * @note If multiple filters are hit, only one of them will have the
+ * result stored in $this->filterMatchType.
+ *
* @param string $input a filename or string containing the XML element
* @param callable $filterCallback (optional)
* Function to call to do additional custom validity checks from the
* SAX element handler event. This gives you access to the element
* namespace, name, attributes, and text contents.
- * Filter should return 'true' to toggle on $this->filterMatch
+ * Filter should return a truthy value describing the error.
* @param bool $isFile (optional) indicates if the first parameter is a
* filename (default, true) or if it is a string (false)
* @param array $options list of additional parsing options:
* processing_instruction_handler: Callback for xml_set_processing_instruction_handler
+ * external_dtd_handler: Callback for the url of external dtd subset
+ * dtd_handler: Callback given the full text of the <!DOCTYPE declaration.
+ * require_safe_dtd: Only allow non-recursive entities in internal dtd (default true)
*/
function __construct( $input, $filterCallback = null, $isFile = true, $options = [] ) {
$this->filterCallback = $filterCallback;
if ( $reader->nodeType === XMLReader::PI ) {
$this->processingInstructionHandler( $reader->name, $reader->value );
}
+ if ( $reader->nodeType === XMLReader::DOC_TYPE ) {
+ $this->DTDHandler( $reader );
+ }
} while ( $reader->nodeType != XMLReader::ELEMENT );
// Process the rest of the document
$reader->value
);
break;
+ case XMLReader::DOC_TYPE:
+ // We should never see a doctype after first
+ // element.
+ $this->wellFormed = false;
+ break;
default:
- // One of DOC, DOC_TYPE, ENTITY, END_ENTITY,
+ // One of DOC, ENTITY, END_ENTITY,
// NOTATION, or XML_DECLARATION
// xml_parse didn't send these to the filter, so we won't.
}
$this->filterMatchType = $callbackReturn;
}
}
+ /**
+ * Handle coming across a <!DOCTYPE declaration.
+ *
+ * @param XMLReader $reader Reader currently pointing at DOCTYPE node.
+ */
+ private function DTDHandler( XMLReader $reader ) {
+ $externalCallback = $this->parserOptions['external_dtd_handler'];
+ $generalCallback = $this->parserOptions['dtd_handler'];
+ $checkIfSafe = $this->parserOptions['require_safe_dtd'];
+ if ( !$externalCallback && !$generalCallback && !$checkIfSafe ) {
+ return;
+ }
+ $dtd = $reader->readOuterXML();
+ $callbackReturn = false;
+
+ if ( $generalCallback ) {
+ $callbackReturn = call_user_func( $generalCallback, $dtd );
+ }
+ if ( $callbackReturn ) {
+ // Filter hit!
+ $this->filterMatch = true;
+ $this->filterMatchType = $callbackReturn;
+ $callbackReturn = false;
+ }
+
+ $parsedDTD = $this->parseDTD( $dtd );
+ if ( $externalCallback && isset( $parsedDTD['type'] ) ) {
+ $callbackReturn = call_user_func(
+ $externalCallback,
+ $parsedDTD['type'],
+ isset( $parsedDTD['publicid'] ) ? $parsedDTD['publicid'] : null,
+ isset( $parsedDTD['systemid'] ) ? $parsedDTD['systemid'] : null
+ );
+ }
+ if ( $callbackReturn ) {
+ // Filter hit!
+ $this->filterMatch = true;
+ $this->filterMatchType = $callbackReturn;
+ $callbackReturn = false;
+ }
+
+ if ( $checkIfSafe && isset( $parsedDTD['internal'] ) ) {
+ if ( !$this->checkDTDIsSafe( $parsedDTD['internal'] ) ) {
+ $this->wellFormed = false;
+ }
+ }
+ }
+
+ /**
+ * Check if the internal subset of the DTD is safe.
+ *
+ * We whitelist an extremely restricted subset of DTD features.
+ *
+ * Safe is defined as:
+ * * Only contains entity defintions (e.g. No <!ATLIST )
+ * * Entity definitions are not "system" entities
+ * * Entity definitions are not "parameter" (i.e. %) entities
+ * * Entity definitions do not reference other entites except &
+ * and quotes. Entity aliases (where the entity contains only
+ * another entity are allowed)
+ * * Entity references aren't overly long (>255 bytes).
+ * * <!ATTLIST svg xmlns:xlink CDATA #FIXED "http://www.w3.org/1999/xlink">
+ * allowed if matched exactly for compatibility with graphviz
+ * * Comments.
+ *
+ * @param string $internalSubset The internal subset of the DTD
+ * @return bool true if safe.
+ */
+ private function checkDTDIsSafe( $internalSubset ) {
+ $offset = 0;
+ $res = preg_match(
+ '/^(?:\s*<!ENTITY\s+\S+\s+' .
+ '(?:"(?:&[^"%&;]{1,64};|(?:[^"%&]|&|"){0,255})"' .
+ '|\'(?:&[^"%&;]{1,64};|(?:[^\'%&]|&|'){0,255})\')\s*>' .
+ '|\s*<!--(?:[^-]|-[^-])*-->' .
+ '|\s*<!ATTLIST svg xmlns:xlink CDATA #FIXED ' .
+ '"http:\/\/www.w3.org\/1999\/xlink">)*\s*$/',
+ $internalSubset
+ );
+
+ return (bool)$res;
+ }
+
+ /**
+ * Parse DTD into parts.
+ *
+ * If there is an error parsing the dtd, sets wellFormed to false.
+ *
+ * @param $dtd string
+ * @return array Possibly containing keys publicid, systemid, type and internal.
+ */
+ private function parseDTD( $dtd ) {
+ $m = [];
+ $res = preg_match(
+ '/^<!DOCTYPE\s*\S+\s*' .
+ '(?:(?P<typepublic>PUBLIC)\s*' .
+ '(?:"(?P<pubquote>[^"]*)"|\'(?P<pubapos>[^\']*)\')' . // public identifer
+ '\s*"(?P<pubsysquote>[^"]*)"|\'(?P<pubsysapos>[^\']*)\'' . // system identifier
+ '|(?P<typesystem>SYSTEM)\s*' .
+ '(?:"(?P<sysquote>[^"]*)"|\'(?P<sysapos>[^\']*)\')' .
+ ')?\s*' .
+ '(?:\[\s*(?P<internal>.*)\])?\s*>$/s',
+ $dtd,
+ $m
+ );
+ if ( !$res ) {
+ $this->wellFormed = false;
+ return [];
+ }
+ $parsed = [];
+ foreach ( $m as $field => $value ) {
+ if ( $value === '' || is_numeric( $field ) ) {
+ continue;
+ }
+ switch ( $field ) {
+ case 'typepublic':
+ case 'typesystem':
+ $parsed['type'] = $value;
+ break;
+ case 'pubquote':
+ case 'pubapos':
+ $parsed['publicid'] = $value;
+ break;
+ case 'pubsysquote':
+ case 'pubsysapos':
+ case 'sysquote':
+ case 'sysapos':
+ $parsed['systemid'] = $value;
+ break;
+ case 'internal':
+ $parsed['internal'] = $value;
+ break;
+ }
+ }
+ return $parsed;
+ }
}