Merge "Include log id in api error response"

[lhc/web/wiklou.git] / includes / parser / Parser.php
diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php

index 5c8253a..b7f8cf2 100644 (file)
--- a/includes/parser/Parser.php
+++ b/includes/parser/Parser.php
@@ -90,6 +90,9 @@ class Parser {
         const EXT_IMAGE_REGEX = '/^(http:\/\/|https:\/\/)([^][<>"\\x00-\\x20\\x7F\p{Zs}]+)
                 \\/([A-Za-z0-9_.,~%\\-+&;#*?!=()@\\x80-\\xFF]+)\\.((?i)gif|png|jpg|jpeg)$/Sxu';
  
+       # Regular expression for a non-newline space
+       const SPACE_NOT_NL = '(?:\t|&nbsp;|&\#0*160;|&\#[Xx]0*[Aa]0;|\p{Zs})';
+
         # State constants for the definition list colon extraction
         const COLON_STATE_TEXT = 0;
         const COLON_STATE_TAG = 1;
@@ -143,7 +146,8 @@ class Parser {
          * @var MagicWordArray
          */
         public $mSubstWords;
-       public $mConf, $mPreprocessor, $mExtLinkBracketedRegex, $mUrlProtocols; # Initialised in constructor
+       # Initialised in constructor
+       public $mConf, $mPreprocessor, $mExtLinkBracketedRegex, $mUrlProtocols;
  
         # Cleared with clearState():
         /**
@@ -630,7 +634,9 @@ class Parser {
          * @param bool|PPFrame $frame
          * @return mixed|string
          */
-       public function preprocess( $text, Title $title = null, ParserOptions $options, $revid = null, $frame = false ) {
+       public function preprocess( $text, Title $title = null,
+               ParserOptions $options, $revid = null, $frame = false
+       ) {
                 wfProfileIn( __METHOD__ );
                 $magicScopeVariable = $this->lock();
                 $this->startParse( $title, $options, self::OT_PREPROCESS, true );
@@ -1389,18 +1395,22 @@ class Parser {
                 wfProfileIn( __METHOD__ );
                 $prots = wfUrlProtocolsWithoutProtRel();
                 $urlChar = self::EXT_LINK_URL_CLASS;
+               $space = self::SPACE_NOT_NL; # non-newline space
+               $spdash = "(?:-|$space)"; # a dash or a non-newline space
+               $spaces = "$space++"; # possessive match of 1 or more spaces
                 $text = preg_replace_callback(
                         '!(?:                           # Start cases
                                 (<a[ \t\r\n>].*?</a>) |     # m[1]: Skip link text
                                 (<.*?>) |                   # m[2]: Skip stuff inside HTML elements' . "
-                               (\b(?i:$prots)$urlChar+) |  # m[3]: Free external links" . '
-                               \b(?:RFC|PMID)\s+([0-9]+)\b |# m[4]: RFC or PMID, capture number
-                               \bISBN\s+(                  # m[5]: ISBN, capture number
-                                       (?: 97[89] [\ \-]? )?   # optional 13-digit ISBN prefix
-                                       (?: [0-9]  [\ \-]? ){9} # 9 digits with opt. delimiters
+                               (\b(?i:$prots)$urlChar+) |  # m[3]: Free external links
+                               \b(?:RFC|PMID) $spaces      # m[4]: RFC or PMID, capture number
+                                       ([0-9]+)\b |
+                               \bISBN $spaces (            # m[5]: ISBN, capture number
+                                       (?: 97[89] $spdash? )?   # optional 13-digit ISBN prefix
+                                       (?: [0-9]  $spdash? ){9} # 9 digits with opt. delimiters
                                         [0-9Xx]                 # check digit
-                                       )\b
-                       )!xu', array( &$this, 'magicLinkCallback' ), $text );
+                               )\b
+                       )!xu", array( &$this, 'magicLinkCallback' ), $text );
                 wfProfileOut( __METHOD__ );
                 return $text;
         }
@@ -1441,6 +1451,8 @@ class Parser {
                 } elseif ( isset( $m[5] ) && $m[5] !== '' ) {
                         # ISBN
                         $isbn = $m[5];
+                       $space = self::SPACE_NOT_NL; # non-newline space
+                       $isbn = preg_replace( "/$space/", ' ', $isbn );
                         $num = strtr( $isbn, array(
                                 '-' => '',
                                 ' ' => '',
@@ -1484,7 +1496,20 @@ class Parser {
                         $sep .= ')';
                 }
  
-               $numSepChars = strspn( strrev( $url ), $sep );
+               $urlRev = strrev( $url );
+               $numSepChars = strspn( $urlRev, $sep );
+               # Don't break a trailing HTML entity by moving the ; into $trail
+               # This is in hot code, so use substr_compare to avoid having to
+               # create a new string object for the comparison
+               if ( $numSepChars && substr_compare( $url, ";", -$numSepChars, 1 ) === 0) {
+                       # more optimization: instead of running preg_match with a $
+                       # anchor, which can be slow, do the match on the reversed
+                       # string starting at the desired offset.
+                       # un-reversed regexp is: /&([a-z]+|#x[\da-f]+|#\d+)$/i
+                       if ( preg_match( '/\G([a-z]+|[\da-f]+x#|\d+#)&/i', $urlRev, $m2, 0, $numSepChars ) ) {
+                               $numSepChars--;
+                       }
+               }
                 if ( $numSepChars ) {
                         $trail = substr( $url, -$numSepChars ) . $trail;
                         $url = substr( $url, 0, -$numSepChars );
@@ -4614,14 +4639,15 @@ class Parser {
                         # * <sup> and <sub> (bug 8393)
                         # * <i> (bug 26375)
                         # * <b> (r105284)
+                       # * <bdi> (bug 72884)
                         # * <span dir="rtl"> and <span dir="ltr"> (bug 35167)
                         #
                         # We strip any parameter from accepted tags (second regex), except dir="rtl|ltr" from <span>,
                         # to allow setting directionality in toc items.
                         $tocline = preg_replace(
                                 array(
-                                       '#<(?!/?(span|sup|sub|i|b)(?: [^>]*)?>).*?' . '>#',
-                                       '#<(/?(?:span(?: dir="(?:rtl|ltr)")?|sup|sub|i|b))(?: .*?)?' . '>#'
+                                       '#<(?!/?(span|sup|sub|bdi|i|b)(?: [^>]*)?>).*?' . '>#',
+                                       '#<(/?(?:span(?: dir="(?:rtl|ltr)")?|sup|sub|bdi|i|b))(?: .*?)?' . '>#'
                                 ),
                                 array( '', '<$1>' ),
                                 $safeHeadline
@@ -4862,6 +4888,7 @@ class Parser {
  
                 $pairs = array(
                         "\r\n" => "\n",
+                       "\r" => "\n",
                 );
                 $text = str_replace( array_keys( $pairs ), array_values( $pairs ), $text );
                 if ( $options->getPreSaveTransform() ) {