cb(), $subject, $flags );
	}

	/**
	 * More or less "markup-safe" explode()
	 * Ignores any instances of the separator inside <...>
	 *
	 * Any NUL bytes already present in $text are stripped first, because NUL
	 * is used internally as a temporary stand-in for the separator.
	 *
	 * @param string $separator
	 * @param string $text
	 * @return array
	 */
	static function explodeMarkup( $separator, $text ) {
		$placeholder = "\x00";

		// Remove placeholder instances
		$text = str_replace( $placeholder, '', $text );

		// Replace instances of the separator inside HTML-like tags with the placeholder
		$replacer = new DoubleReplacer( $separator, $placeholder );
		$cleaned = StringUtils::delimiterReplaceCallback( '<', '>', $replacer->cb(), $text );

		// Explode, then put the replaced separators back in
		$items = explode( $separator, $cleaned );
		foreach ( $items as $i => $str ) {
			$items[$i] = str_replace( $placeholder, $separator, $str );
		}

		return $items;
	}

	/**
	 * Escape a string to make it suitable for inclusion in a preg_replace()
	 * replacement parameter.
	 *
	 * Backslashes are doubled first so that the backslash added in front of
	 * each "$" is not itself escaped again.
	 *
	 * @param string $string
	 * @return string
	 */
	static function escapeRegexReplacement( $string ) {
		$string = str_replace( '\\', '\\\\', $string );
		$string = str_replace( '$', '\\$', $string );
		return $string;
	}

	/**
	 * Workalike for explode() with limited memory usage.
	 *
	 * When the separator occurs more than 1000 times an ExplodeIterator is
	 * returned; otherwise the result of explode() is wrapped in an
	 * ArrayIterator.
	 *
	 * @param string $separator
	 * @param string $subject
	 * @return Iterator ArrayIterator or ExplodeIterator
	 */
	static function explode( $separator, $subject ) {
		if ( substr_count( $subject, $separator ) > 1000 ) {
			return new ExplodeIterator( $separator, $subject );
		} else {
			return new ArrayIterator( explode( $separator, $subject ) );
		}
	}

	/**
	 * Clean characters that are invalid in the given character set
	 * from a given string.
	 *
	 * Only 'UTF-8' is handled; for any other character set the string is
	 * returned unchanged.
	 *
	 * @param string $string String to clean
	 * @param string $charset Character set (if unspecified, assume $wgOutputEncoding)
	 * @return string Cleaned string
	 */
	public static function cleanForCharset( $string, $charset='' ) {
		global $wgOutputEncoding;
		switch ( $charset ? $charset : $wgOutputEncoding ) {
			# UTF-8 should be all we need to worry about. :)
			case 'UTF-8':
				return self::cleanUtf8( $string );
			default:
				return $string;
		}
	}

	/**
	 * Clean invalid UTF-8 characters and sequences from a given string,
	 * replacing them with U+FFFD.
	 * Should be RFC 3629 compliant.
	 *
	 * One U+FFFD is emitted for each of the following:
	 *  - a stray continuation byte, or a byte that cannot start a sequence
	 *  - a sequence cut short by a non-continuation byte or by the end of
	 *    the string
	 *  - an overlong encoding, a 5- or 6-byte form, or a value above U+10FFFF
	 *  - a UTF-16 surrogate (U+D800..U+DFFF), U+FFFE or U+FFFF
	 *
	 * @param string $str String to clean
	 * @return string Cleaned string
	 */
	private static function cleanUtf8( $str ) {
		# HERE BE DRAGONS!
		# ABANDON ALL HOPE, ALL YE WHO ENTER THE BITWISE HELLFIRE.
		$replacement = "\xEF\xBF\xBD"; // U+FFFD encoded as UTF-8 (3 bytes)
		$len = strlen( $str );
		// $left: continuation bytes still expected for the current sequence
		// $bytes: number of bytes that will be replaced if the sequence is bad
		// $sum: code point accumulated so far
		$left = $bytes = 0;
		$sum = 0;

		for ( $i = 0; $i < $len; $i++ ) {
			$ch = ord( $str[$i] );

			if ( !$left ) {
				// Not inside a sequence: plain ASCII needs no work
				if ( !( $ch & 0x80 ) ) {
					continue;
				}

				// Work out from the lead byte how many continuation bytes follow
				if ( ( $ch & 0xFE ) == 0xFC ) {
					$left = 5;
				} elseif ( ( $ch & 0xFC ) == 0xF8 ) {
					$left = 4;
				} elseif ( ( $ch & 0xF8 ) == 0xF0 ) {
					$left = 3;
				} elseif ( ( $ch & 0xF0 ) == 0xE0 ) {
					$left = 2;
				} elseif ( ( $ch & 0xE0 ) == 0xC0 ) {
					$left = 1;
				} else {
					$left = 0;
				}

				if ( $left ) {
					$bytes = $left + 1;
					// Keep only the payload bits of the lead byte
					$sum = $ch & ( 0xFF >> ( $bytes + 1 ) );
					continue;
				}

				// High bit set but not a valid lead byte: replace this one byte
				$bytes = 1;
			} else if ( ( $ch & 0xC0 ) == 0x80 ) {
				// Continuation byte: append its six payload bits
				$sum <<= 6;
				$sum += $ch & 0x3F;
				if ( --$left ) {
					continue;
				}

				// Sequence complete; check that the decoded value is acceptable
				$invalid =
					( $bytes == 2 && $sum < 0x80 ) ||
					( $bytes == 3 && $sum < 0x800 ) ||
					( $bytes == 4 && $sum < 0x10000 ) ||
					( $bytes > 4 || $sum > 0x10FFFF ) ||
					// The whole surrogate range is forbidden, not just its end points
					( $sum >= 0xD800 && $sum <= 0xDFFF ) ||
					$sum == 0xFFFE || $sum == 0xFFFF;
				if ( !$invalid ) {
					continue;
				}
			} else {
				// Expected a continuation byte but got something else: replace
				// only the bytes consumed so far and step back so that the
				// current byte is examined again on the next iteration.
				$bytes -= $left;
				$i--;
			}

			// Replace the $bytes bytes ending at offset $i with U+FFFD, then
			// adjust the offset and length for the change in size.
			$str = ( substr( $str, 0, $i - $bytes + 1 ) .
				$replacement .
				substr( $str, $i + 1 ) );
			$i += 3 - $bytes;
			$len += 3 - $bytes;
			$left = 0;
		}

		if ( $left ) {
			// The string ended in the middle of a multi-byte sequence; the
			// incomplete sequence is the last ( $bytes - $left ) bytes.
			$str = substr( $str, 0, $len - ( $bytes - $left ) ) . $replacement;
		}

		return $str;
	}
}

/**
 * Base class for "replacers", objects used in preg_replace_callback() and
 * StringUtils::delimiterReplaceCallback()
 *
 * Subclasses provide a replace( $matches ) method.
 */
class Replacer {
	/**
	 * Get a callback that invokes this object's replace() method.
	 *
	 * @return array Callable in array( object, method name ) form
	 */
	function cb() {
		return array( $this, 'replace' );
	}
}

/**
 * Class to replace regex matches with a string similar to that used in preg_replace()
 */
class RegexlikeReplacer extends Replacer {
	// Replacement template containing $0, $1, ... references
	var $r;

	/**
	 * @param string $r Replacement template
	 */
	function __construct( $r ) {
		$this->r = $r;
	}

	/**
	 * Substitute each "$<key>" in the template with the corresponding match.
	 *
	 * @param array $matches
	 * @return string
	 */
	function replace( $matches ) {
		$pairs = array();
		foreach ( $matches as $i => $match ) {
			$pairs["\$$i"] = $match;
		}
		return strtr( $this->r, $pairs );
	}
}

/**
 * Class to perform secondary replacement within each replacement string
 */
class DoubleReplacer extends Replacer {
	// Declared explicitly so that the constructor does not create dynamic properties
	var $from, $to, $index;

	/**
	 * @param string $from Text to search for inside the match
	 * @param string $to Text to replace it with
	 * @param int $index Which element of the match array to operate on
	 */
	function __construct( $from, $to, $index = 0 ) {
		$this->from = $from;
		$this->to = $to;
		$this->index = $index;
	}

	/**
	 * @param array $matches
	 * @return string The selected match with $from replaced by $to
	 */
	function replace( $matches ) {
		return str_replace( $this->from, $this->to, $matches[$this->index] );
	}
}

/**
 * Class to perform replacement based on a simple hashtable lookup
 */
class HashtableReplacer extends Replacer {
	var $table, $index;

	/**
	 * @param array $table Map of matched text to replacement
	 * @param int $index Which element of the match array to look up
	 */
	function __construct( $table, $index = 0 ) {
		$this->table = $table;
		$this->index = $index;
	}

	/**
	 * @param array $matches
	 * @return mixed The table entry for the selected match
	 */
	function replace( $matches ) {
		return $this->table[$matches[$this->index]];
	}
}

/**
 * Replacement array for FSS with fallback to strtr()
 * Supports lazy initialisation of FSS resource
 */
class ReplacementArray {
	/*mostly private*/ var $data = false;
	/*mostly private*/ var $fss = false;

	/**
	 * Create an object with the specified replacement array
	 * The array should have the same form as the replacement array for strtr()
	 *
	 * @param array $data
	 */
	function __construct( $data = array() ) {
		$this->data = $data;
	}

	/**
	 * Serialize only the replacement data, not the prepared FSS value.
	 *
	 * @return array
	 */
	function __sleep() {
		return array( 'data' );
	}

	/**
	 * Reset the FSS value after unserialization so that replace() prepares it again.
	 */
	function __wakeup() {
		$this->fss = false;
	}

	/**
	 * Set the whole replacement array at once
	 *
	 * @param array $data
	 */
	function setArray( $data ) {
		$this->data = $data;
		$this->fss = false;
	}

	/**
	 * @return array The current replacement array
	 */
	function getArray() {
		return $this->data;
	}

	/**
	 * Set an element of the replacement array
	 *
	 * @param string $from
	 * @param string $to
	 */
	function setPair( $from, $to ) {
		$this->data[$from] = $to;
		$this->fss = false;
	}

	/**
	 * Merge an array of pairs into the replacement array using array_merge()
	 *
	 * @param array $data
	 */
	function mergeArray( $data ) {
		$this->data = array_merge( $this->data, $data );
		$this->fss = false;
	}

	/**
	 * Merge the pairs of another ReplacementArray into this one
	 *
	 * @param ReplacementArray $other
	 */
	function merge( $other ) {
		$this->data = array_merge( $this->data, $other->data );
		$this->fss = false;
	}

	/**
	 * Remove a single pair, identified by its search string
	 *
	 * @param string $from
	 */
	function removePair( $from ) {
		unset( $this->data[$from] );
		$this->fss = false;
	}

	/**
	 * Remove every pair whose search string is a key of $data
	 *
	 * @param array $data Only the keys are used
	 */
	function removeArray( $data ) {
		foreach ( $data as $from => $to ) {
			$this->removePair( $from );
		}
		$this->fss = false;
	}

	/**
	 * Apply the replacement array to a string.
	 *
	 * Uses fss_prep_replace()/fss_exec_replace() when those functions exist,
	 * preparing the FSS value on first use; otherwise falls back to strtr().
	 *
	 * @param string $subject
	 * @return string
	 */
	function replace( $subject ) {
		if ( function_exists( 'fss_prep_replace' ) ) {
			wfProfileIn( __METHOD__.'-fss' );
			if ( $this->fss === false ) {
				$this->fss = fss_prep_replace( $this->data );
			}
			$result = fss_exec_replace( $this->fss, $subject );
			wfProfileOut( __METHOD__.'-fss' );
		} else {
			wfProfileIn( __METHOD__.'-strtr' );
			$result = strtr( $subject, $this->data );
			wfProfileOut( __METHOD__.'-strtr' );
		}
		return $result;
	}
}

/**
 * An iterator which yields the same elements as:
 *
 *   foreach ( explode( $delim, $s ) as $element ) {
 *      ...
 *   }
 *
 * Except it doesn't use 193 bytes per element. Note that key() returns the
 * byte offset of the current element within the subject, not a sequential
 * index.
 */
class ExplodeIterator implements Iterator {
	// The subject string
	var $subject, $subjectLength;

	// The delimiter
	var $delim, $delimLength;

	// The position of the start of the line
	var $curPos;

	// The position after the end of the next delimiter
	var $endPos;

	// The current token
	var $current;

	/**
	 * Construct an ExplodeIterator
	 *
	 * @param string $delim Delimiter to split on
	 * @param string $s Subject string
	 */
	function __construct( $delim, $s ) {
		$this->subject = $s;
		$this->delim = $delim;

		// Micro-optimisation (theoretical)
		$this->subjectLength = strlen( $s );
		$this->delimLength = strlen( $delim );

		$this->rewind();
	}

	/**
	 * Move back to the first element
	 */
	function rewind() {
		$this->curPos = 0;
		$this->endPos = strpos( $this->subject, $this->delim );
		$this->refreshCurrent();
	}

	/**
	 * Recompute the current token from curPos and endPos
	 */
	function refreshCurrent() {
		if ( $this->curPos === false ) {
			// Iteration has finished
			$this->current = false;
		} elseif ( $this->curPos >= $this->subjectLength ) {
			// The subject is empty or ended with a delimiter: final empty element
			$this->current = '';
		} elseif ( $this->endPos === false ) {
			// No further delimiter: the rest of the subject is the last element
			$this->current = substr( $this->subject, $this->curPos );
		} else {
			$this->current = substr( $this->subject, $this->curPos, $this->endPos - $this->curPos );
		}
	}

	/**
	 * @return string|bool The current element, or false once iteration has finished
	 */
	function current() {
		return $this->current;
	}

	/**
	 * @return int|bool Byte offset of the current element, or false once finished
	 */
	function key() {
		return $this->curPos;
	}

	/**
	 * Advance to the next element
	 *
	 * @return string|bool The new current element, or false if there is none
	 */
	function next() {
		if ( $this->endPos === false ) {
			// The element just visited was the last one
			$this->curPos = false;
		} else {
			$this->curPos = $this->endPos + $this->delimLength;
			if ( $this->curPos >= $this->subjectLength ) {
				$this->endPos = false;
			} else {
				$this->endPos = strpos( $this->subject, $this->delim, $this->curPos );
			}
		}
		$this->refreshCurrent();
		return $this->current;
	}

	/**
	 * @return bool Whether the iterator currently points at an element
	 */
	function valid() {
		return $this->curPos !== false;
	}
}