}
}
- if ( $attribute === 'id' )
- $value = Sanitizer::escapeId( $value );
+ if ( $attribute === 'id' ) {
+ global $wgEnforceHtmlIds;
+ $value = Sanitizer::escapeId( $value,
+ $wgEnforceHtmlIds ? array() : 'xml' );
+ }
// If this attribute was previously set, override it.
// Output should only have one attribute of each name.
* @param string $id Id to validate
* @param mixed $options String or array of strings (default is array()):
* 'noninitial': This is a non-initial fragment of an id, not a full id,
- * so don't prepend an 'x' if the first character isn't valid at the
+ * so don't pay attention if the first character isn't valid at the
* beginning of an id.
+ * 'xml': Don't restrict the id to be HTML4-compatible. This option
+ * allows any alphabetic character to be used, per the XML standard.
+ * Therefore, it also completely changes the type of escaping: instead
+ * of weird dot-encoding, runs of invalid characters (mostly
+ * whitespace) are just compressed into a single underscore.
* @return string
*/
static function escapeId( $id, $options = array() ) {
$options = (array)$options;
- static $replace = array(
- '%3A' => ':',
- '%' => '.'
- );
-
- $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
- $id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
-
- if( preg_match( '/[^a-zA-Z]/', $id[0] )
- && !in_array( 'noninitial', $options ) ) {
- // Initial character must be a letter!
- $id = "x$id";
+
+ if ( !in_array( 'xml', $options ) ) {
+ # HTML4-style escaping
+ static $replace = array(
+ '%3A' => ':',
+ '%' => '.'
+ );
+
+ $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
+ $id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
+
+ if ( preg_match( '/[^a-zA-Z]/', $id[0] )
+ && !in_array( 'noninitial', $options ) ) {
+ // Initial character must be a letter!
+ $id = "x$id";
+ }
+ return $id;
+ }
+
+ # XML-style escaping. For the patterns used, see the XML 1.0 standard,
+ # 5th edition, NameStartChar and NameChar: <http://www.w3.org/TR/REC-xml/>
+ $nameStartChar = ':a-zA-Z_\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}'
+ . '\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}'
+ . '\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}';
+ $nameChar = $nameStartChar . '.\-0-9\xB7\x{0300}-\x{036F}'
+ . '\x{203F}-\x{2040}';
+ # Replace _ as well so we don't get multiple consecutive underscores
+ $id = preg_replace( "/([^$nameChar]|_)+/u", '_', $id );
+ $id = trim( $id, '_' );
+
+ if ( !preg_match( "/^[$nameStartChar]/u", $id )
+ && !in_array( 'noninitial', $options ) ) {
+ $id = "_$id";
}
+
return $id;
}
* @private
*/
function formatHeadings( $text, $isMain=true ) {
- global $wgMaxTocLevel, $wgContLang;
+ global $wgMaxTocLevel, $wgContLang, $wgEnforceHtmlIds;
$doNumberHeadings = $this->mOptions->getNumberHeadings();
$showEditLink = $this->mOptions->getEditSection();
# Save headline for section edit hint before it's escaped
$headlineHint = $safeHeadline;
- $safeHeadline = Sanitizer::escapeId( $safeHeadline );
+ $safeHeadline = Sanitizer::escapeId( $safeHeadline,
+ $wgEnforceHtmlIds ? array() : 'xml' );
# HTML names must be case-insensitively unique (bug 10721)
$arrayKey = strtolower( $safeHeadline );