From 465e50e8e7b81a238dfee2d1c8daf82ba12b76b1 Mon Sep 17 00:00:00 2001 From: Tim Starling Date: Mon, 21 Feb 2011 14:17:50 +0000 Subject: [PATCH] =?utf8?q?Rewrote=20LanguageConverter::autoConvert()=20to?= =?utf8?q?=20make=20it=20use=20preg=5Fmatch()=20with=20an=20offset=20inste?= =?utf8?q?ad=20of=20preg=5Fsplit().=20Reduces=20memory=20usage=20for=20my?= =?utf8?q?=20test=20case=20([[=E5=8F=B0=E7=81=A3=E6=BC=94=E5=93=A1?= =?utf8?q?=E5=88=97=E8=A1=A8]])=20to=20a=20negligible=20amount.=20This=20s?= =?utf8?q?hould=20eliminate=20the=20most=20common=20cause=20of=20OOMs=20on?= =?utf8?q?=20Wikimedia.=20Produces=20the=20exact=20same=20output=20for=20t?= =?utf8?q?hat=20test=20case=20for=20the=20zh=20->=20zh-tw,=20parser=20test?= =?utf8?q?s=20pass,=20seems=20to=20work.?= MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit --- languages/LanguageConverter.php | 126 +++++++++++++++++--------------- 1 file changed, 66 insertions(+), 60 deletions(-) diff --git a/languages/LanguageConverter.php b/languages/LanguageConverter.php index 9c923b910f..7170787244 100644 --- a/languages/LanguageConverter.php +++ b/languages/LanguageConverter.php @@ -299,34 +299,6 @@ class LanguageConverter { return $this->mHeaderVariant; } - /** - * Caption convert, base on preg_replace_callback. - * - * To convert text in "title" or "alt", like 'textgetPreferredVariant(); - $title = $matches[1]; - $text = $matches[2]; - - // we convert captions except URL - if ( !strpos( $text, '://' ) ) { - $text = $this->translate( $text, $toVariant ); - } - - // remove HTML tags to prevent disrupting the layout - $text = preg_replace( '/<[^>]+>/', '', $text ); - // escape HTML special chars to prevent disrupting the layout - $text = htmlspecialchars( $text ); - - return " {$title}=\"{$text}\""; - } - /** * Dictionary-based conversion. * This function would not parse the conversion rules. @@ -374,41 +346,75 @@ class LanguageConverter { $reg = '/' . $codefix . $scriptfix . $prefix . '<[^>]+>|&[a-zA-Z#][a-z0-9]+;' . $marker . $htmlfix . '/s'; + $startPos = 0; + $sourceBlob = ''; + $literalBlob = ''; + + // Guard against delimiter nulls in the input + $text = str_replace( "\000", '', $text ); + + while ( $startPos < strlen( $text ) ) { + if ( preg_match( $reg, $text, $markupMatches, PREG_OFFSET_CAPTURE, $startPos ) ) { + $elementPos = $markupMatches[0][1]; + $element = $markupMatches[0][0]; + } else { + $elementPos = strlen( $text ); + $element = ''; + } + + // Queue the part before the markup for translation in a batch + $sourceBlob .= substr( $text, $startPos, $elementPos - $startPos ) . "\000"; + + // Advance to the next position + $startPos = $elementPos + strlen( $element ); - $matches = preg_split( $reg, $text, - 1, PREG_SPLIT_OFFSET_CAPTURE ); - - $m = array_shift( $matches ); - - $ret = $this->translate( $m[0], $toVariant ); - $mstart = $m[1] + strlen( $m[0] ); - - // enable convertsion of 'xxxxtranslate( $trtext, $toVariant ); - $trtext = StringUtils::explode( $trtextmark, $trtext ); - foreach ( $trtext as $t ) { - $ret .= array_shift( $notrtext ); - $ret .= $t; + // Translate any alt or title attributes inside the matched element + if ( $element !== '' && preg_match( '/^(<[^>\s]*)\s([^>]*)(.*)$/', $element, + $elementMatches ) ) + { + $attrs = Sanitizer::decodeTagAttributes( $elementMatches[2] ); + $changed = false; + foreach ( array( 'title', 'alt' ) as $attrName ) { + if ( !isset( $attrs[$attrName] ) ) { + continue; + } + $attr = $attrs[$attrName]; + // Don't convert URLs + if ( !strpos( $attr, '://' ) ) { + $attr = $this->translate( $attr, $toVariant ); + } + + // Remove HTML tags to avoid disrupting the layout + $attr = preg_replace( '/<[^>]+>/', '', $attr ); + if ( $attr !== $attrs[$attrName] ) { + $attrs[$attrName] = $attr; + $changed = true; + } + } + if ( $changed ) { + $element = $elementMatches[1] . Html::expandAttributes( $attrs ) . + $elementMatches[3]; + } + } + $literalBlob .= $element . "\000"; + } + + // Do the main translation batch + $translatedBlob = $this->translate( $sourceBlob, $toVariant ); + + // Put the output back together + $translatedIter = StringUtils::explode( "\000", $translatedBlob ); + $literalIter = StringUtils::explode( "\000", $literalBlob ); + $output = ''; + while ( $translatedIter->valid() && $literalIter->valid() ) { + $output .= $translatedIter->current(); + $output .= $literalIter->current(); + $translatedIter->next(); + $literalIter->next(); } + wfProfileOut( __METHOD__ ); - return $ret; + return $output; } /** -- 2.20.1