From f0555bab3d84621aa7a5776c98ae51d72e162291 Mon Sep 17 00:00:00 2001 From: Brian Wolff Date: Wed, 15 Nov 2017 05:34:10 +0000 Subject: [PATCH] Fix langauge converter parser test with self-close tags This fixes an issue in f21f3942 where if there was an html element with an alt or title attribute containing an < entity, an ascii EOT control character (0x04) may become inserted into the text if language converter was enabled. Due to a really old bug in language converter, self-closed tags got turned into non-self closed tags. However due a different bug which was fixed in f21f3942 this code path was rarely taken so nobody noticed until now. Follow-up Idbc45cac12 Bug: T180552 Change-Id: I077d30c50fcb419837fef937d27caca307153d2d --- languages/LanguageConverter.php | 18 ++++++++++++++++-- tests/parser/parserTests.txt | 2 +- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/languages/LanguageConverter.php b/languages/LanguageConverter.php index 8e98abd618..1d78402901 100644 --- a/languages/LanguageConverter.php +++ b/languages/LanguageConverter.php @@ -353,7 +353,6 @@ class LanguageConverter { if ( $this->guessVariant( $text, $toVariant ) ) { return $text; } - /* we convert everything except: 1. HTML markups (anything between < and >) 2. HTML entities @@ -389,6 +388,7 @@ class LanguageConverter { // Guard against delimiter nulls in the input // (should never happen: see T159174) $text = str_replace( "\000", '', $text ); + $text = str_replace( "\004", '', $text ); $markupMatches = null; $elementMatches = null; @@ -403,6 +403,13 @@ class LanguageConverter { // We hit the end. $elementPos = strlen( $text ); $element = ''; + } elseif( substr( $element, -1 ) === "\004" ) { + // This can sometimes happen if we have + // unclosed html tags (For example + // when converting a title attribute + // during a recursive call that contains + // a < e.g.
. + $element = substr( $element, 0, -1 ); } } else { // If we hit here, then Language Converter could be tricked @@ -430,7 +437,14 @@ class LanguageConverter { if ( $element !== '' && preg_match( '/^(<[^>\s]*+)\s([^>]*+)(.*+)$/', $element, $elementMatches ) ) { + // FIXME, this decodes entities, so if you have something + // like
the bar won't get + // translated since after entity decoding it looks like + // unclosed html and we call this method recursively + // on attributes. $attrs = Sanitizer::decodeTagAttributes( $elementMatches[2] ); + // Ensure self-closing tags stay self-closing. + $close = substr( $elementMatches[2], -1 ) === '/' ? ' /' : ''; $changed = false; foreach ( [ 'title', 'alt' ] as $attrName ) { if ( !isset( $attrs[$attrName] ) ) { @@ -449,7 +463,7 @@ class LanguageConverter { } if ( $changed ) { $element = $elementMatches[1] . Html::expandAttributes( $attrs ) . - $elementMatches[3]; + $close . $elementMatches[3]; } } $literalBlob .= $element . "\000"; diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt index 7e678d8146..56879ffc74 100644 --- a/tests/parser/parserTests.txt +++ b/tests/parser/parserTests.txt @@ -18515,7 +18515,7 @@ language=sr variant=sr-el [[File:Foobar.jpg|alt=-{}-foAjrjvi-{}-]] !! html

-

" onload="alert(1)" data-foo=" +

" onload="alert(1)" data-foo="

!! end -- 2.20.1