- /* private */ function removeHTMLtags( $text )
- {
- wfProfileIn( "OutputPage::removeHTMLtags" );
- $htmlpairs = array( # Tags that must be closed
- "b", "i", "u", "font", "big", "small", "sub", "sup", "h1",
- "h2", "h3", "h4", "h5", "h6", "cite", "code", "em", "s",
- "strike", "strong", "tt", "var", "div", "center",
- "blockquote", "ol", "ul", "dl", "table", "caption", "pre",
- "ruby", "rt" , "rb" , "rp"
- );
- $htmlsingle = array(
- "br", "p", "hr", "li", "dt", "dd"
- );
- $htmlnest = array( # Tags that can be nested--??
- "table", "tr", "td", "th", "div", "blockquote", "ol", "ul",
- "dl", "font", "big", "small", "sub", "sup"
- );
- $tabletags = array( # Can only appear inside table
- "td", "th", "tr"
- );
-
- $htmlsingle = array_merge( $tabletags, $htmlsingle );
- $htmlelements = array_merge( $htmlsingle, $htmlpairs );
-
- $htmlattrs = array( # Allowed attributes--no scripting, etc.
- "title", "align", "lang", "dir", "width", "height",
- "bgcolor", "clear", /* BR */ "noshade", /* HR */
- "cite", /* BLOCKQUOTE, Q */ "size", "face", "color",
- /* FONT */ "type", "start", "value", "compact",
- /* For various lists, mostly deprecated but safe */
- "summary", "width", "border", "frame", "rules",
- "cellspacing", "cellpadding", "valign", "char",
- "charoff", "colgroup", "col", "span", "abbr", "axis",
- "headers", "scope", "rowspan", "colspan", /* Tables */
- "id", "class", "name", "style" /* For CSS */
- );
-
- # Remove HTML comments
- $text = preg_replace( "/<!--.*-->/sU", "", $text );
-
- $bits = explode( "<", $text );
- $text = array_shift( $bits );
- $tagstack = array(); $tablestack = array();
-
- foreach ( $bits as $x ) {
- $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
- preg_match( "/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/",
- $x, $regs );
- list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
- error_reporting( $prev );
-
- $badtag = 0 ;
- if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
- # Check our stack
- if ( $slash ) {
- # Closing a tag...
- if ( ! in_array( $t, $htmlsingle ) &&
- ( $ot = array_pop( $tagstack ) ) != $t ) {
- array_push( $tagstack, $ot );
- $badtag = 1;
- } else {
- if ( $t == "table" ) {
- $tagstack = array_pop( $tablestack );
- }
- $newparams = "";
- }
- } else {
- # Keep track for later
- if ( in_array( $t, $tabletags ) &&
- ! in_array( "table", $tagstack ) ) {
- $badtag = 1;
- } else if ( in_array( $t, $tagstack ) &&
- ! in_array ( $t , $htmlnest ) ) {
- $badtag = 1 ;
- } else if ( ! in_array( $t, $htmlsingle ) ) {
- if ( $t == "table" ) {
- array_push( $tablestack, $tagstack );
- $tagstack = array();
- }
- array_push( $tagstack, $t );
- }
- # Strip non-approved attributes from the tag
- $newparams = preg_replace(
- "/(\\w+)(\\s*=\\s*([^\\s\">]+|\"[^\">]*\"))?/e",
- "(in_array(strtolower(\"\$1\"),\$htmlattrs)?(\"\$1\".((\"x\$3\" != \"x\")?\"=\$3\":'')):'')",
- $params);
- }
- if ( ! $badtag ) {
- $rest = str_replace( ">", ">", $rest );
- $text .= "<$slash$t$newparams$brace$rest";
- continue;
- }
- }
- $text .= "<" . str_replace( ">", ">", $x);
- }
- # Close off any remaining tags
- while ( $t = array_pop( $tagstack ) ) {
- $text .= "</$t>\n";
- if ( $t == "table" ) { $tagstack = array_pop( $tablestack ); }