From 11093c73a13ab70adaf9fe4a75ff285f0305e804 Mon Sep 17 00:00:00 2001 From: Arne Heizmann Date: Fri, 6 Aug 2004 20:47:21 +0000 Subject: [PATCH] Somewhat less hacky fix to the French l''''homme''' problem. --- includes/Parser.php | 197 +++++++++++++++++++++++++++++++++----------- 1 file changed, 151 insertions(+), 46 deletions(-) diff --git a/includes/Parser.php b/includes/Parser.php index 37d0a93cb1..d3fe58dd85 100644 --- a/includes/Parser.php +++ b/includes/Parser.php @@ -822,9 +822,6 @@ class Parser $fname = 'Parser::internalParse'; wfProfileIn( $fname ); - global $fixLbug ; - if ( $fixLbug ) $text = preg_replace ( '/(l|L)\'/' , '\\1'' , $text ) ; - $text = $this->removeHTMLtags( $text ); $text = $this->replaceVariables( $text, $args ); @@ -836,12 +833,10 @@ class Parser $text = $wgDateFormatter->reformat( $this->mOptions->getDateFormat(), $text ); } $text = $this->doAllQuotes( $text ); - // $text = $this->doExponent( $text ); $text = $this->replaceExternalLinks( $text ); $text = $this->doMagicLinks( $text ); $text = $this->replaceInternalLinks ( $text ); $text = $this->replaceInternalLinks ( $text ); - //$text = $this->doTokenizedParser ( $text ); $text = $this->doTableStuff( $text ); $text = $this->formatHeadings( $text, $isMain ); $sk =& $this->mOptions->getSkin(); @@ -892,57 +887,167 @@ class Parser $outtext = ''; $lines = explode( "\n", $text ); foreach ( $lines as $line ) { - $outtext .= $this->doQuotes ( '', $line, '' ) . "\n"; + $outtext .= $this->doQuotes ( $line ) . "\n"; } $outtext = substr($outtext, 0,-1); wfProfileOut( $fname ); return $outtext; } - /* private */ function doQuotes( $pre, $text, $mode ) { - if ( preg_match( "/^(.*)''(.*)$/sU", $text, $m ) ) { - $m1_strong = ($m[1] == "") ? "" : "{$m[1]}"; - $m1_em = ($m[1] == "") ? "" : "{$m[1]}"; - if ( substr ($m[2], 0, 1) == '\'' ) { - $m[2] = substr ($m[2], 1); - if ($mode == 'em') { - return $this->doQuotes ( $m[1], $m[2], ($m[1] == '') ? 'both' : 'emstrong' ); - } else if ($mode == 'strong') { - return $m1_strong . $this->doQuotes ( '', $m[2], '' ); - } else if (($mode == 'emstrong') || ($mode == 'both')) { - return $this->doQuotes ( '', $pre.$m1_strong.$m[2], 'em' ); - } else if ($mode == 'strongem') { - return "{$pre}{$m1_em}" . $this->doQuotes ( '', $m[2], 'em' ); - } else { - return $m[1] . $this->doQuotes ( '', $m[2], 'strong' ); + /* private */ function doQuotes( $text ) { + $arr = preg_split ("/(''+)/", $text, -1, PREG_SPLIT_DELIM_CAPTURE); + if (count ($arr) == 1) + return $text; + else + { + $i = 0; + foreach ($arr as $r) + { + if (($i % 2) == 1) + { + # If there are ever four apostrophes, assume the first is supposed to + # be text, and the remaining three constitute mark-up for bold text. + if (strlen ($arr[$i]) == 4) + { + $arr[$i-1] .= "'"; + $arr[$i] = "'''"; + } + # If there are more than 5 apostrophes in a row, assume they're all + # text except for the last 5. + else if (strlen ($arr[$i]) > 5) + { + $arr[$i-1] .= str_repeat ("'", strlen ($arr[$i]) - 5); + $arr[$i] = "'''''"; + } + } - } else { - if ($mode == 'strong') { - return $this->doQuotes ( $m[1], $m[2], ($m[1] == '') ? 'both' : 'strongem' ); - } else if ($mode == 'em') { - return $m1_em . $this->doQuotes ( '', $m[2], '' ); - } else if ($mode == 'emstrong') { - return "{$pre}{$m1_strong}" . $this->doQuotes ( '', $m[2], 'strong' ); - } else if (($mode == 'strongem') || ($mode == 'both')) { - return $this->doQuotes ( '', $pre.$m1_em.$m[2], 'strong' ); - } else { - return $m[1] . $this->doQuotes ( '', $m[2], 'em' ); + $i++; + } + + # Now see if there's an odd or even number of "bold" and "italic" + # mark-up. There should normally be an even number of both. + $i = 0; + $numbold = 0; + $numitalics = 0; + foreach ($arr as $r) + { + if (($i % 2) == 1) + { + if (strlen ($r) == 2) $numitalics++; else + if (strlen ($r) == 3) $numbold++; else + if (strlen ($r) == 5) { $numitalics++; $numbold++; } } + $i++; } - } else { - $text_strong = ($text == '') ? '' : "{$text}"; - $text_em = ($text == '') ? '' : "{$text}"; - if ($mode == '') { - return $pre . $text; - } else if ($mode == 'em') { - return $pre . $text_em; - } else if ($mode == 'strong') { - return $pre . $text_strong; - } else if ($mode == 'strongem') { - return (($pre == '') && ($text == '')) ? '' : "{$pre}{$text_em}"; - } else { - return (($pre == '') && ($text == '')) ? '' : "{$pre}{$text_strong}"; + + # If there is an odd number of both bold and italics, it is likely + # that one of the bold ones was meant to be an apostrophe followed + # by italics. Which one we cannot know for certain, but it is more + # likely to be one that has a single-letter word before it. + if (($numbold % 2 == 1) && ($numitalics % 2 == 1)) + { + $i = 0; + $firstsingleletterword = -1; + $firstmultiletterword = -1; + $firstspace = -1; + foreach ($arr as $r) + { + if (($i % 2 == 1) and (strlen ($r) == 3)) + { + $x1 = substr ($arr[$i-1], -1); + $x2 = substr ($arr[$i-1], -2, 1); + if ($x1 == " ") { + if ($firstspace == -1) $firstspace = $i; + } else if ($x2 == " ") { + if ($firstsingleletterword == -1) $firstsingleletterword = $i; + } else { + if ($firstmultiletterword == -1) $firstmultiletterword = $i; + } + } + $i++; + } + + # If there is a single-letter word, use it! + if ($firstsingleletterword > -1) + { + $arr [ $firstsingleletterword ] = "''"; + $arr [ $firstsingleletterword-1 ] .= "'"; + } + # If not, but there's a multi-letter word, use that one. + else if ($firstmultiletterword > -1) + { + $arr [ $firstmultiletterword ] = "''"; + $arr [ $firstmultiletterword-1 ] .= "'"; + } + # ... otherwise use the first one that has neither. + else + { + $arr [ $firstspace ] = "''"; + $arr [ $firstspace-1 ] .= "'"; + } + } + + # Now let's actually convert our apostrophic mush to HTML! + $output = ''; + $buffer = ''; + $state = ''; + $i = 0; + foreach ($arr as $r) + { + if (($i % 2) == 0) + { + if ($state == 'both') + $buffer .= $r; + else + $output .= $r; + } + else + { + if (strlen ($r) == 2) + { + if ($state == 'em') + { $output .= ""; $state = ''; } + else if ($state == 'strongem') + { $output .= ""; $state = 'strong'; } + else if ($state == 'emstrong') + { $output .= ""; $state = 'strong'; } + else if ($state == 'both') + { $output .= "{$buffer}"; $state = 'strong'; } + else # $state can be 'strong' or '' + { $output .= ""; $state .= 'em'; } + } + else if (strlen ($r) == 3) + { + if ($state == 'strong') + { $output .= ""; $state = ''; } + else if ($state == 'strongem') + { $output .= ""; $state = 'em'; } + else if ($state == 'emstrong') + { $output .= ""; $state = 'em'; } + else if ($state == 'both') + { $output .= "{$buffer}"; $state = 'em'; } + else # $state can be 'em' or '' + { $output .= ""; $state .= 'strong'; } + } + else if (strlen ($r) == 5) + { + if ($state == 'strong') + { $output .= ""; $state = 'em'; } + else if ($state == 'em') + { $output .= ""; $state = 'strong'; } + else if ($state == 'strongem') + { $output .= ""; $state = ''; } + else if ($state == 'emstrong') + { $output .= ""; $state = ''; } + else if ($state == 'both') + { $output .= "{$buffer}"; $state = ''; } + else # ($state == '') + { $buffer = ''; $state = 'both'; } + } + } + $i++; } + return $output; } } -- 2.20.1