Made strong/em handling more forgiving against unbalanced ticks

[lhc/web/wiklou.git] / includes / Parser.php
diff --git a/includes/Parser.php b/includes/Parser.php

index 28ffb86..05c6786 100644 (file)
--- a/includes/Parser.php
+++ b/includes/Parser.php
@@ -547,7 +547,7 @@ class Parser
                 return $t ;
         }
  
-       function internalParse( $text, $linestart, $args = array() )
+       function internalParse( $text, $linestart, $args = array(), $isMain=true )
         {
                 $fname = "Parser::internalParse";
                 wfProfileIn( $fname );
@@ -565,7 +565,7 @@ class Parser
                 $text = $this->replaceExternalLinks( $text );
                 $text = $this->doTokenizedParser ( $text );
                 $text = $this->doTableStuff ( $text ) ;
-               $text = $this->formatHeadings( $text );
+               $text = $this->formatHeadings( $text, $isMain );
                 $sk =& $this->mOptions->getSkin();
                 $text = $sk->transformContent( $text );
  
@@ -675,6 +675,21 @@ class Parser
                 return $s;
         }
  
+       /* private */ function handle4Quotes( &$state, $token )
+       {
+               /* Four apostrophes are read as a ''' (strong) token plus one literal apostrophe.
+                * '''Caesar''''s army  => <strong>Caesar</strong>'s army
+                * ''''Caesar'''' was a Roman emperor => '<strong>Caesar</strong>' was a Roman emperor
+                * If strong is open, the apostrophe follows the closing tag; otherwise it precedes the opening tag.
+                * These assumptions might be wrong, but any other assumption might be wrong, too. */
+               if ( $state["strong"] !== false ) {
+                       return $this->handle3Quotes( $state, $token ) . "'";
+               } else {
+                       return "'" . $this->handle3Quotes( $state, $token );
+               }
+       }
+
+
         /* private */ function handle3Quotes( &$state, $token )
         {
                 if ( $state["strong"] !== false ) {
@@ -688,7 +703,7 @@ class Parser
                         $state["strong"] = FALSE;
                 } else {
                         $s = "<strong>";
-                       $state["strong"] = isset($token["pos"]) ? $token["pos"] : true;
+                       $state["strong"] = $token["pos"];
                 }
                 return $s;
         }
@@ -706,7 +721,7 @@ class Parser
                         $state["em"] = FALSE;
                 } else {
                         $s = "<em>";
-                       $state["em"] = isset($token["pos"]) ? $token["pos"] : true;
+                       $state["em"] = $token["pos"];
  
                 }
                 return $s;
@@ -732,7 +747,7 @@ class Parser
                         $state["em"] = $token["pos"];
                 } else { # not $em and not $strong
                         $s .= "<strong><em>";
-                       $state["strong"] = $state["em"] = isset($token["pos"]) ? $token["pos"] : true;
+                       $state["strong"] = $state["em"] = $token["pos"];
                 }
                 return $s;
         }
@@ -830,7 +845,7 @@ class Parser
                                         $txt = "\n<hr />\n";
                                         break;
                                 case "'''":
-                                       # This and the three next ones handle quotes
+                                       # This and the next three cases handle quotes
                                         $txt = $this->handle3Quotes( $state, $token );
                                         break;
                                 case "''":
@@ -839,10 +854,26 @@ class Parser
                                 case "'''''":
                                         $txt = $this->handle5Quotes( $state, $token );
                                         break;
+                               case "''''":
+                                       $txt = $this->handle4Quotes( $state, $token );
+                                       break;
                                 case "":
                                         # empty token
                                         $txt="";
                                         break;
+                               case "h": 
+                                       # Heading token: close all unbalanced bold or em tags left open in this section.
+                                       $txt = '';  # everything this case emits is collected in $txt, as in the other cases
+                                       if( $state['em'] !== false and 
+                                       ( $state['strong'] === false or $state['em'] > $state['strong'] ) )
+                                       { 
+                                               $txt .= '</em>';  # em is the only open tag or has the larger token pos (presumably opened later): close it first
+                                               $state['em'] = false;
+                                       }
+                                       if ( $state['strong'] !== false ) $txt .= '</strong>';
+                                       if ( $state['em'] !== false ) $txt .= '</em>';  # em opened before strong: close it last
+                                       $state['strong'] = $state['em'] = false;  # nothing is left open after a heading
+                                       break;
                                 case "RFC ":
                                         if ( $tagIsOpen ) {
                                                 $txt = "RFC ";
@@ -886,6 +917,19 @@ class Parser
                                 $s .= $txt;
                         }
                 } #end while
+
+               # Make 100% sure all strong and em tags are closed.
+               # doBlockLevels often messes up the last bit, but invalid nesting is better than unclosed tags,
+               # and tidy fixes that anyway.
+               if( $state['em'] !== false and 
+               ( $state['strong'] === false or $state['em'] > $state['strong'] ) )
+               { 
+                       $s .= '</em>';
+                       $state['em'] = false;
+               }
+               if ( $state['strong'] !== false ) $s .= '</strong>';
+               if ( $state['em'] !== false ) $s .= '</em>';
+
                 if ( count( $tokenStack ) != 0 )
                 {
                         # still objects on stack. opened [[ tag without closing ]] tag.
@@ -1507,8 +1551,7 @@ class Parser
  
                         # Run full parser on the included text
                         $text = $this->strip( $text, $this->mStripState );
-                       $text = $this->internalParse( $text, (bool)$newline, $assocArgs );
-                       if(!empty($newline)) $text = "\n".$text;
+                       $text = $this->internalParse( $text, (bool)$newline, $assocArgs, false );
  
                         # Add the result to the strip state for re-inclusion after
                         # the rest of the processing
@@ -1673,12 +1716,10 @@ class Parser
   *
   */
  
-       /* private */ function formatHeadings( $text )
+       /* private */ function formatHeadings( $text, $isMain=true )
         {
-               global $wgInputEncoding,$wgRequest,$wgOut;
+               global $wgInputEncoding;
                 
-               $startsection=$wgRequest->getVal('section');    
-               if($startsection) { $startsection--;}
                 $doNumberHeadings = $this->mOptions->getNumberHeadings();
                 $doShowToc = $this->mOptions->getShowToc();
                 if( !$this->mTitle->userCanEdit() ) {
@@ -1810,12 +1851,12 @@ class Parser
                                 if ( empty( $head[$headlineCount] ) ) {
                                         $head[$headlineCount] = "";
                                 }
-                               $head[$headlineCount] .= $sk->editSectionLink($startsection+$headlineCount+1);
+                               $head[$headlineCount] .= $sk->editSectionLink($headlineCount+1);
                         }
  
                         # Add the edit section span
                         if( $rightClickHack ) {
-                               $headline = $sk->editSectionScript($startsection+$headlineCount+1,$headline);
+                               $headline = $sk->editSectionScript($headlineCount+1,$headline);
                         }
  
                         # give headline the correct <h#> tag
@@ -1845,19 +1886,10 @@ class Parser
                                 # $full .= $sk->editSectionLink(0);
                         }
                         $full .= $block;
-                       if( $doShowToc && !$i) {
+                       if( $doShowToc && !$i && $isMain) {
                         # Top anchor now in skin
                                 $full = $full.$toc;
                         }
-                       
-                       # If a page is viewed in collapsed mode, a TOC generated
-                       # from the wikisource is stored in the title object.
-                       # This TOC is now fetched and inserted here if it exists.
-                       $collapsedtoc=$wgOut->getToc();
-                       if ($collapsedtoc && !$i) {
-                               $full = $full.$collapsedtoc;            
-                       }
-                       $wgOut->setToc("");
  
                         if( !empty( $head[$i] ) ) {
                                 $full .= $head[$i];
@@ -1868,115 +1900,6 @@ class Parser
                 return $full;
         }
  
-       /* Generates a HTML-formatted table of contents which links to individual sections 
-          from the wikisource. Used for collapsing long pages.
-        */        
-       /* static */ function getTocFromSource( $text ) {               
-               
-               global $wgUser,$wgInputEncoding,$wgTitle,$wgOut,$wgParser;              
-               $sk = $wgUser->getSkin();               
-               
-               $striparray=array();
-               $oldtype=$wgParser->mOutputType;
-               $wgParser->mOutputType=OT_WIKI;
-               $text=$wgParser->strip($text, $striparray, true);
-               $wgParser->mOutputType=$oldtype;        
-               
-               $numMatches = preg_match_all( "/^(=+)(.*?)=+|^<h([1-6]).*?>(.*?)<\/h[1-6].*?>/mi",$text,$matches);
-               
-               # no headings: text cannot be collapsed
-               if( $numMatches == 0 ) {
-                       return "";
-               }
-               
-               # We combine the headlines into a bundle and convert them to HTML
-               # in order to make stripping out the wikicrap easier.
-               $combined=implode("!@@@!",$matches[2]);
-               $myout=$wgParser->parse($combined,$wgTitle,$wgOut->mParserOptions);                     
-               $combined_html=$myout->getText();               
-               $headlines=array();
-               $headlines=explode("!@@@!",$combined_html);
-               
-               # headline counter
-               $headlineCount = 0;             
-               $toclevel = 0;
-               $toc = "";
-               $full = "";
-               $head = array();
-               $sublevelCount = array();
-               $level = 0;
-               $prevlevel = 0;
-               foreach( $headlines as $headline ) {                    
-                       $headline=trim($headline);
-                       $numbering = "";
-                       if( $level ) {
-                               $prevlevel = $level;
-                       }
-                       $level = $matches[1][$headlineCount];
-                       
-                       # wikisource headings need to be converted into numbers
-                       # =foo= equals <h1>foo</h1>, ==foo== equals <h2>foo</h2> etc.
-                       if(strpos($level,"=")!==false) {
-                               $level=strlen($level);                  
-                       }
-                       
-                       if(  $prevlevel && $level > $prevlevel ) {
-                               # reset when we enter a new level
-                               $sublevelCount[$level] = 0;
-                               $toc .= $sk->tocIndent( $level - $prevlevel );
-                               $toclevel += $level - $prevlevel;
-                       }
-                       if( $level < $prevlevel ) {
-                               # reset when we step back a level
-                               $sublevelCount[$level+1]=0;
-                               $toc .= $sk->tocUnindent( $prevlevel - $level );
-                               $toclevel -= $prevlevel - $level;
-                       }
-                       # count number of headlines for each level
-                       @$sublevelCount[$level]++;                      
-                       $dot = 0;
-                       for( $i = 1; $i <= $level; $i++ ) {
-                               if( !empty( $sublevelCount[$i] ) ) {
-                                       if( $dot ) {
-                                               $numbering .= ".";
-                                       }
-                                       $numbering .= $sublevelCount[$i];
-                                       $dot = 1;
-                               }
-                       }
-                       
-
-                       # The canonized header is a version of the header text safe to use for links
-                       # Avoid insertion of weird stuff like <math> by expanding the relevant sections
-                       $state=array();
-                       $canonized_headline = Parser::unstrip( $headline, $state);                      
-                       
-                       # strip out HTML
-                       $canonized_headline = preg_replace( "/<.*?" . ">/","",$canonized_headline );
-                       $tocline = trim( $canonized_headline );
-                       $canonized_headline = preg_replace("/[ \\?&\\/<>\\(\\)\\[\\]=,+']+/", '_', urlencode( do_html_entity_decode( $tocline, ENT_COMPAT, $wgInputEncoding ) ) );
-                       $refer[$headlineCount] = $canonized_headline;
-
-                       # count how many in assoc. array so we can track dupes in anchors
-                       @$refers[$canonized_headline]++;
-                       $refcount[$headlineCount]=$refers[$canonized_headline];
-                       $tocline = $numbering . " " . $tocline;
-
-                       # Create the anchor for linking from the TOC to the section
-                       $anchor = trim($canonized_headline);
-                       
-                       if($refcount[$headlineCount] > 1 ) {
-                               $anchor .= "_" . $refcount[$headlineCount];
-                       }                       
-                       $headlineCount++;
-                       $toc .= $sk->tocLine($anchor,$tocline,$toclevel,$headlineCount);
-               }
-               $toclines = $headlineCount;
-               $toc .= $sk->tocUnindent( $toclevel );
-               $toc = $sk->tocTable( $toc );
-               return $toc;
-       
-       }
         /* private */ function doMagicISBN( &$tokenizer )
         {
                 global $wgLang;