Merge "Improve MediaWikiVersionFetcherTest"
[lhc/web/wiklou.git] / includes / libs / JavaScriptMinifier.php
1 <?php
2 /**
3 * JavaScript Minifier
4 *
5 * @file
6 * @author Paul Copperman <paul.copperman@gmail.com>
7 * @license Apache-2.0
8 * @license MIT
9 * @license GPL-2.0-or-later
10 * @license LGPL-2.1-or-later
11 */
12
13 /**
14 * This class is meant to safely minify javascript code, while leaving syntactically correct
15 * programs intact. Other libraries, such as JSMin require a certain coding style to work
16 * correctly. OTOH, libraries like jsminplus, that do parse the code correctly are rather
17 * slow, because they construct a complete parse tree before outputting the code minified.
18 * So this class is meant to allow arbitrary (but syntactically correct) input, while being
19 * fast enough to be used for on-the-fly minifying.
20 *
21 * This class was written with ECMA-262 Edition 3 in mind ("ECMAScript 3"). Parsing features
22 * new to ECMAScript 5 or later might not be supported. However, Edition 5.1 better reflects
23 * how actual JS engines worked and work and is simpler and more readable prose. As such,
24 * the below code will refer to sections of the 5.1 specification.
25 *
26 * See <https://www.ecma-international.org/ecma-262/5.1/>.
27 */
class JavaScriptMinifier {

	/* Parsing states.
	 * The state machine is only necessary to decide whether to parse a slash as division
	 * operator or as regexp literal.
	 * States are named after the next expected item. We only distinguish states when the
	 * distinction is relevant for our purpose.
	 */
	const STATEMENT = 0;
	const CONDITION = 1;
	const PROPERTY_ASSIGNMENT = 2;
	const EXPRESSION = 3;
	const EXPRESSION_NO_NL = 4; // only relevant for semicolon insertion
	const EXPRESSION_OP = 5;
	const EXPRESSION_FUNC = 6;
	const EXPRESSION_TERNARY = 7; // used to determine the role of a colon
	const EXPRESSION_TERNARY_OP = 8;
	const EXPRESSION_TERNARY_FUNC = 9;
	const PAREN_EXPRESSION = 10; // expression which is not on the top level
	const PAREN_EXPRESSION_OP = 11;
	const PAREN_EXPRESSION_FUNC = 12;
	const PROPERTY_EXPRESSION = 13; // expression which is within an object literal
	const PROPERTY_EXPRESSION_OP = 14;
	const PROPERTY_EXPRESSION_FUNC = 15;

	/* Token types */
	const TYPE_UN_OP = 101; // unary operators
	const TYPE_INCR_OP = 102; // ++ and --
	const TYPE_BIN_OP = 103; // binary operators
	const TYPE_ADD_OP = 104; // + and - which can be either unary or binary ops
	const TYPE_HOOK = 105; // ?
	const TYPE_COLON = 106; // :
	const TYPE_COMMA = 107; // ,
	const TYPE_SEMICOLON = 108; // ;
	const TYPE_BRACE_OPEN = 109; // {
	const TYPE_BRACE_CLOSE = 110; // }
	const TYPE_PAREN_OPEN = 111; // ( and [
	const TYPE_PAREN_CLOSE = 112; // ) and ]
	const TYPE_RETURN = 113; // keywords: break, continue, return, throw
	const TYPE_IF = 114; // keywords: catch, for, with, switch, while, if
	const TYPE_DO = 115; // keywords: case, var, finally, else, do, try
	const TYPE_FUNC = 116; // keywords: function
	const TYPE_LITERAL = 117; // all literals, identifiers and unrecognised tokens

	// Sanity limit to avoid excessive memory usage
	const STACK_LIMIT = 1000;

	/**
	 * NOTE: This isn't a strict maximum. Longer lines will be produced when
	 * literals (e.g. quoted strings) longer than this are encountered
	 * or when required to guard against semicolon insertion.
	 */
	const MAX_LINE_LENGTH = 1000;

	/**
	 * Returns minified JavaScript code.
	 *
	 * Whitespace and comments are dropped, tokens are only separated where that is
	 * required to keep them distinct, and the boolean literals `true` / `false` are
	 * rewritten to `!0` / `!1` in plain expression context.
	 *
	 * @param string $s JavaScript code to minify
	 * @return string|false Minified code, or false (the result of parseError()) when a
	 *  malformed numeric literal is encountered
	 */
	public static function minify( $s ) {
		// First we declare a few tables that contain our parsing rules

		// $opChars : Characters which can be combined without whitespace between them.
		$opChars = [
			// ECMAScript 5.1 § 7.7 Punctuators
			// Unlike the spec, these are individual symbols, not sequences.
			'{' => true,
			'}' => true,
			'(' => true,
			')' => true,
			'[' => true,
			']' => true,
			'.' => true,
			';' => true,
			',' => true,
			'<' => true,
			'>' => true,
			'=' => true,
			'!' => true,
			'+' => true,
			'-' => true,
			'*' => true,
			'%' => true,
			'&' => true,
			'|' => true,
			'^' => true,
			'~' => true,
			'?' => true,
			':' => true,
			'/' => true,
			// ECMAScript 5.1 § 7.8.4 String Literals
			'"' => true,
			"'" => true,
		];

		// $tokenTypes : Map keywords and operators to their corresponding token type
		$tokenTypes = [
			// ECMAScript 5.1 § 11.4 Unary Operators
			// ECMAScript 5.1 § 11.6 Additive Operators
			// UnaryExpression includes PostfixExpression, which includes 'new'.
			'new' => self::TYPE_UN_OP,
			'delete' => self::TYPE_UN_OP,
			'void' => self::TYPE_UN_OP,
			'typeof' => self::TYPE_UN_OP,
			'++' => self::TYPE_INCR_OP,
			'--' => self::TYPE_INCR_OP,
			'+' => self::TYPE_ADD_OP,
			'-' => self::TYPE_ADD_OP,
			'~' => self::TYPE_UN_OP,
			'!' => self::TYPE_UN_OP,
			// ECMAScript 5.1 § 11.5 Multiplicative Operators
			'*' => self::TYPE_BIN_OP,
			'/' => self::TYPE_BIN_OP,
			'%' => self::TYPE_BIN_OP,
			// ECMAScript 5.1 § 11.7 Bitwise Shift Operators
			'<<' => self::TYPE_BIN_OP,
			'>>' => self::TYPE_BIN_OP,
			'>>>' => self::TYPE_BIN_OP,
			// ECMAScript 5.1 § 11.8 Relational Operators
			'<' => self::TYPE_BIN_OP,
			'>' => self::TYPE_BIN_OP,
			'<=' => self::TYPE_BIN_OP,
			'>=' => self::TYPE_BIN_OP,
			// ECMAScript 5.1 § 11.9 Equality Operators
			'==' => self::TYPE_BIN_OP,
			'!=' => self::TYPE_BIN_OP,
			'===' => self::TYPE_BIN_OP,
			'!==' => self::TYPE_BIN_OP,
			'instanceof' => self::TYPE_BIN_OP,
			'in' => self::TYPE_BIN_OP,
			// ECMAScript 5.1 § 11.10 Binary Bitwise Operators
			'&' => self::TYPE_BIN_OP,
			'^' => self::TYPE_BIN_OP,
			'|' => self::TYPE_BIN_OP,
			// ECMAScript 5.1 § 11.11 Binary Logical Operators
			'&&' => self::TYPE_BIN_OP,
			'||' => self::TYPE_BIN_OP,
			// ECMAScript 5.1 § 11.12 Conditional Operator
			// Also known as ternary.
			'?' => self::TYPE_HOOK,
			':' => self::TYPE_COLON,
			// ECMAScript 5.1 § 11.13 Assignment Operators
			'=' => self::TYPE_BIN_OP,
			'*=' => self::TYPE_BIN_OP,
			'/=' => self::TYPE_BIN_OP,
			'%=' => self::TYPE_BIN_OP,
			'+=' => self::TYPE_BIN_OP,
			'-=' => self::TYPE_BIN_OP,
			'<<=' => self::TYPE_BIN_OP,
			'>>=' => self::TYPE_BIN_OP,
			'>>>=' => self::TYPE_BIN_OP,
			'&=' => self::TYPE_BIN_OP,
			'^=' => self::TYPE_BIN_OP,
			'|=' => self::TYPE_BIN_OP,
			// ECMAScript 5.1 § 11.14 Comma Operator
			',' => self::TYPE_COMMA,

			// The keywords that disallow LineTerminator before their
			// (sometimes optional) Expression or Identifier.
			//
			//    keyword ;
			//    keyword [no LineTerminator here] Identifier ;
			//    keyword [no LineTerminator here] Expression ;
			//
			// See also ECMAScript 5.1:
			// - § 12.7 The continue Statement
			// - § 12.8 The break Statement
			// - § 12.9 The return Statement
			// - § 12.13 The throw Statement
			'continue' => self::TYPE_RETURN,
			'break' => self::TYPE_RETURN,
			'return' => self::TYPE_RETURN,
			'throw' => self::TYPE_RETURN,

			// The keywords require a parenthesised Expression or Identifier
			// before the next Statement.
			//
			//     keyword ( Expression ) Statement
			//     keyword ( Identifier ) Statement
			//
			// See also ECMAScript 5.1:
			// - § 12.5 The if Statement
			// - § 12.6 Iteration Statements (while, for)
			// - § 12.10 The with Statement
			// - § 12.11 The switch Statement
			// - § 12.14 The try Statement (catch)
			'if' => self::TYPE_IF,
			'catch' => self::TYPE_IF,
			'while' => self::TYPE_IF,
			'for' => self::TYPE_IF,
			'switch' => self::TYPE_IF,
			'with' => self::TYPE_IF,

			// The keywords followed by an Identifier, Statement,
			// Expression, or Block.
			//
			//     var Identifier
			//     else Statement
			//     do Statement
			//     case Expression
			//     try Block
			//     finally Block
			//
			// See also ECMAScript 5.1:
			// - § 12.2 Variable Statement
			// - § 12.5 The if Statement (else)
			// - § 12.6 Iteration Statements (do, while, for)
			// - § 12.11 The switch Statement (case)
			// - § 12.14 The try Statement
			'var' => self::TYPE_DO,
			'else' => self::TYPE_DO,
			'do' => self::TYPE_DO,
			'case' => self::TYPE_DO,
			'try' => self::TYPE_DO,
			'finally' => self::TYPE_DO,

			// ECMAScript 5.1 § 13 Function Definition
			'function' => self::TYPE_FUNC,

			// Can be one of:
			// - DecimalLiteral (ECMAScript 5.1 § 7.8.3 Numeric Literals)
			// - MemberExpression (ECMAScript 5.1 § 11.2 Left-Hand-Side Expressions)
			'.' => self::TYPE_BIN_OP,

			// Can be one of:
			// - Block (ECMAScript 5.1 § 12.1 Block)
			// - ObjectLiteral (ECMAScript 5.1 § 11.1 Primary Expressions)
			'{' => self::TYPE_BRACE_OPEN,
			'}' => self::TYPE_BRACE_CLOSE,

			// Can be one of:
			// - Parenthesised Identifier or Expression after a
			//   TYPE_IF or TYPE_FUNC keyword.
			// - PrimaryExpression (ECMAScript 5.1 § 11.1 Primary Expressions)
			// - CallExpression (ECMAScript 5.1 § 11.2 Left-Hand-Side Expressions)
			'(' => self::TYPE_PAREN_OPEN,
			')' => self::TYPE_PAREN_CLOSE,

			// Can be one of:
			// - ArrayLiteral (ECMAScript 5.1 § 11.1 Primary Expressions)
			'[' => self::TYPE_PAREN_OPEN,
			']' => self::TYPE_PAREN_CLOSE,

			// Can be one of:
			// - End of any statement
			// - EmptyStatement (ECMAScript 5.1 § 12.3 Empty Statement)
			';' => self::TYPE_SEMICOLON,
		];

		// $goto : This is the main table for our state machine. For every state/token pair
		//         the following state is defined. When no rule exists for a given pair,
		//         the state is left unchanged.
		$goto = [
			self::STATEMENT => [
				self::TYPE_UN_OP => self::EXPRESSION,
				self::TYPE_INCR_OP => self::EXPRESSION,
				self::TYPE_ADD_OP => self::EXPRESSION,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION,
				self::TYPE_RETURN => self::EXPRESSION_NO_NL,
				self::TYPE_IF => self::CONDITION,
				self::TYPE_FUNC => self::CONDITION,
				self::TYPE_LITERAL => self::EXPRESSION_OP
			],
			self::CONDITION => [
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION
			],
			self::PROPERTY_ASSIGNMENT => [
				self::TYPE_COLON => self::PROPERTY_EXPRESSION,
				self::TYPE_BRACE_OPEN => self::STATEMENT
			],
			self::EXPRESSION => [
				self::TYPE_SEMICOLON => self::STATEMENT,
				self::TYPE_BRACE_OPEN => self::PROPERTY_ASSIGNMENT,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION,
				self::TYPE_FUNC => self::EXPRESSION_FUNC,
				self::TYPE_LITERAL => self::EXPRESSION_OP
			],
			self::EXPRESSION_NO_NL => [
				self::TYPE_SEMICOLON => self::STATEMENT,
				self::TYPE_BRACE_OPEN => self::PROPERTY_ASSIGNMENT,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION,
				self::TYPE_FUNC => self::EXPRESSION_FUNC,
				self::TYPE_LITERAL => self::EXPRESSION_OP
			],
			self::EXPRESSION_OP => [
				self::TYPE_BIN_OP => self::EXPRESSION,
				self::TYPE_ADD_OP => self::EXPRESSION,
				self::TYPE_HOOK => self::EXPRESSION_TERNARY,
				self::TYPE_COLON => self::STATEMENT,
				self::TYPE_COMMA => self::EXPRESSION,
				self::TYPE_SEMICOLON => self::STATEMENT,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION
			],
			self::EXPRESSION_FUNC => [
				self::TYPE_BRACE_OPEN => self::STATEMENT
			],
			self::EXPRESSION_TERNARY => [
				self::TYPE_BRACE_OPEN => self::PROPERTY_ASSIGNMENT,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION,
				self::TYPE_FUNC => self::EXPRESSION_TERNARY_FUNC,
				self::TYPE_LITERAL => self::EXPRESSION_TERNARY_OP
			],
			self::EXPRESSION_TERNARY_OP => [
				self::TYPE_BIN_OP => self::EXPRESSION_TERNARY,
				self::TYPE_ADD_OP => self::EXPRESSION_TERNARY,
				self::TYPE_HOOK => self::EXPRESSION_TERNARY,
				self::TYPE_COMMA => self::EXPRESSION_TERNARY,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION
			],
			self::EXPRESSION_TERNARY_FUNC => [
				self::TYPE_BRACE_OPEN => self::STATEMENT
			],
			self::PAREN_EXPRESSION => [
				self::TYPE_BRACE_OPEN => self::PROPERTY_ASSIGNMENT,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION,
				self::TYPE_FUNC => self::PAREN_EXPRESSION_FUNC,
				self::TYPE_LITERAL => self::PAREN_EXPRESSION_OP
			],
			self::PAREN_EXPRESSION_OP => [
				self::TYPE_BIN_OP => self::PAREN_EXPRESSION,
				self::TYPE_ADD_OP => self::PAREN_EXPRESSION,
				self::TYPE_HOOK => self::PAREN_EXPRESSION,
				self::TYPE_COLON => self::PAREN_EXPRESSION,
				self::TYPE_COMMA => self::PAREN_EXPRESSION,
				self::TYPE_SEMICOLON => self::PAREN_EXPRESSION,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION
			],
			self::PAREN_EXPRESSION_FUNC => [
				self::TYPE_BRACE_OPEN => self::STATEMENT
			],
			self::PROPERTY_EXPRESSION => [
				self::TYPE_BRACE_OPEN => self::PROPERTY_ASSIGNMENT,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION,
				self::TYPE_FUNC => self::PROPERTY_EXPRESSION_FUNC,
				self::TYPE_LITERAL => self::PROPERTY_EXPRESSION_OP
			],
			self::PROPERTY_EXPRESSION_OP => [
				self::TYPE_BIN_OP => self::PROPERTY_EXPRESSION,
				self::TYPE_ADD_OP => self::PROPERTY_EXPRESSION,
				self::TYPE_HOOK => self::PROPERTY_EXPRESSION,
				self::TYPE_COMMA => self::PROPERTY_ASSIGNMENT,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION
			],
			self::PROPERTY_EXPRESSION_FUNC => [
				self::TYPE_BRACE_OPEN => self::STATEMENT
			]
		];

		// $push : This table contains the rules for when to push a state onto the stack.
		//         The pushed state is the state to return to when the corresponding
		//         closing token is found
		$push = [
			self::STATEMENT => [
				self::TYPE_BRACE_OPEN => self::STATEMENT,
				self::TYPE_PAREN_OPEN => self::EXPRESSION_OP
			],
			self::CONDITION => [
				self::TYPE_PAREN_OPEN => self::STATEMENT
			],
			self::PROPERTY_ASSIGNMENT => [
				self::TYPE_BRACE_OPEN => self::PROPERTY_ASSIGNMENT
			],
			self::EXPRESSION => [
				self::TYPE_BRACE_OPEN => self::EXPRESSION_OP,
				self::TYPE_PAREN_OPEN => self::EXPRESSION_OP
			],
			self::EXPRESSION_NO_NL => [
				self::TYPE_BRACE_OPEN => self::EXPRESSION_OP,
				self::TYPE_PAREN_OPEN => self::EXPRESSION_OP
			],
			self::EXPRESSION_OP => [
				self::TYPE_HOOK => self::EXPRESSION,
				self::TYPE_PAREN_OPEN => self::EXPRESSION_OP
			],
			self::EXPRESSION_FUNC => [
				self::TYPE_BRACE_OPEN => self::EXPRESSION_OP
			],
			self::EXPRESSION_TERNARY => [
				self::TYPE_BRACE_OPEN => self::EXPRESSION_TERNARY_OP,
				self::TYPE_PAREN_OPEN => self::EXPRESSION_TERNARY_OP
			],
			self::EXPRESSION_TERNARY_OP => [
				self::TYPE_HOOK => self::EXPRESSION_TERNARY,
				self::TYPE_PAREN_OPEN => self::EXPRESSION_TERNARY_OP
			],
			self::EXPRESSION_TERNARY_FUNC => [
				self::TYPE_BRACE_OPEN => self::EXPRESSION_TERNARY_OP
			],
			self::PAREN_EXPRESSION => [
				self::TYPE_BRACE_OPEN => self::PAREN_EXPRESSION_OP,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION_OP
			],
			self::PAREN_EXPRESSION_OP => [
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION_OP
			],
			self::PAREN_EXPRESSION_FUNC => [
				self::TYPE_BRACE_OPEN => self::PAREN_EXPRESSION_OP
			],
			self::PROPERTY_EXPRESSION => [
				self::TYPE_BRACE_OPEN => self::PROPERTY_EXPRESSION_OP,
				self::TYPE_PAREN_OPEN => self::PROPERTY_EXPRESSION_OP
			],
			self::PROPERTY_EXPRESSION_OP => [
				self::TYPE_PAREN_OPEN => self::PROPERTY_EXPRESSION_OP
			],
			self::PROPERTY_EXPRESSION_FUNC => [
				self::TYPE_BRACE_OPEN => self::PROPERTY_EXPRESSION_OP
			]
		];

		// $pop : Rules for when to pop a state from the stack
		$pop = [
			self::STATEMENT => [ self::TYPE_BRACE_CLOSE => true ],
			self::PROPERTY_ASSIGNMENT => [ self::TYPE_BRACE_CLOSE => true ],
			self::EXPRESSION => [ self::TYPE_BRACE_CLOSE => true ],
			self::EXPRESSION_NO_NL => [ self::TYPE_BRACE_CLOSE => true ],
			self::EXPRESSION_OP => [ self::TYPE_BRACE_CLOSE => true ],
			self::EXPRESSION_TERNARY_OP => [ self::TYPE_COLON => true ],
			self::PAREN_EXPRESSION => [ self::TYPE_PAREN_CLOSE => true ],
			self::PAREN_EXPRESSION_OP => [ self::TYPE_PAREN_CLOSE => true ],
			self::PROPERTY_EXPRESSION => [ self::TYPE_BRACE_CLOSE => true ],
			self::PROPERTY_EXPRESSION_OP => [ self::TYPE_BRACE_CLOSE => true ]
		];

		// $semicolon : Rules for when a semicolon insertion is appropriate
		$semicolon = [
			self::EXPRESSION_NO_NL => [
				self::TYPE_UN_OP => true,
				self::TYPE_INCR_OP => true,
				self::TYPE_ADD_OP => true,
				self::TYPE_BRACE_OPEN => true,
				self::TYPE_PAREN_OPEN => true,
				self::TYPE_RETURN => true,
				self::TYPE_IF => true,
				self::TYPE_DO => true,
				self::TYPE_FUNC => true,
				self::TYPE_LITERAL => true
			],
			self::EXPRESSION_OP => [
				self::TYPE_UN_OP => true,
				self::TYPE_INCR_OP => true,
				self::TYPE_BRACE_OPEN => true,
				self::TYPE_RETURN => true,
				self::TYPE_IF => true,
				self::TYPE_DO => true,
				self::TYPE_FUNC => true,
				self::TYPE_LITERAL => true
			]
		];

		// $divStates : Contains all states that can be followed by a division operator
		$divStates = [
			self::EXPRESSION_OP => true,
			self::EXPRESSION_TERNARY_OP => true,
			self::PAREN_EXPRESSION_OP => true,
			self::PROPERTY_EXPRESSION_OP => true
		];

		// Here's where the minifying takes place: Loop through the input, looking for tokens
		// and output them to $out, taking actions to the above defined rules when appropriate.
		$out = '';
		$pos = 0;
		$length = strlen( $s );
		$lineLength = 0;
		$newlineFound = true;
		$state = self::STATEMENT;
		$stack = [];
		$last = ';'; // Pretend that we have already seen a semicolon
		while ( $pos < $length ) {
			// First, skip over any whitespace and multiline comments, recording whether we
			// found any newline character
			$skip = strspn( $s, " \t\n\r\xb\xc", $pos );
			if ( !$skip ) {
				$ch = $s[$pos];
				if ( $ch === '/' && substr( $s, $pos, 2 ) === '/*' ) {
					// Multiline comment. Search for the end token or EOT.
					$end = strpos( $s, '*/', $pos + 2 );
					$skip = $end === false ? $length - $pos : $end - $pos + 2;
				}
			}
			if ( $skip ) {
				// The semicolon insertion mechanism needs to know whether there was a newline
				// between two tokens, so record it now.
				if ( !$newlineFound && strcspn( $s, "\r\n", $pos, $skip ) !== $skip ) {
					$newlineFound = true;
				}
				$pos += $skip;
				continue;
			}
			// Handle C++-style comments and html comments, which are treated as single line
			// comments by the browser, regardless of whether the end tag is on the same line.
			// Handle --> the same way, but only if it's at the beginning of the line
			if ( ( $ch === '/' && substr( $s, $pos, 2 ) === '//' )
				|| ( $ch === '<' && substr( $s, $pos, 4 ) === '<!--' )
				|| ( $ch === '-' && $newlineFound && substr( $s, $pos, 3 ) === '-->' )
			) {
				$pos += strcspn( $s, "\r\n", $pos );
				continue;
			}

			// Find out which kind of token we're handling.
			// Note: $end must point past the end of the current token
			// so that `substr($s, $pos, $end - $pos)` would be the entire token.
			// In other words, $end will be the offset of the last relevant character
			// in the stream + 1, or simply put: The offset of the first character
			// of any next token in the stream.
			$end = $pos + 1;
			// Handle string literals
			if ( $ch === "'" || $ch === '"' ) {
				// Search to the end of the string literal, skipping over backslash escapes
				$search = $ch . '\\';
				do {
					// Speculatively add 2 to the end so that if we see a backslash,
					// the next iteration will start 2 characters further (one for the
					// backslash, one for the escaped character).
					// We'll correct this outside the loop.
					$end += strcspn( $s, $search, $end ) + 2;
					// If the last character in our search for a quote or a backslash
					// matched a backslash and we haven't reached the end, keep searching..
				} while ( $end - 2 < $length && $s[$end - 2] === '\\' );
				// Correction (1): Undo speculative add, keep only one (end of string literal)
				$end--;
				if ( $end > $length ) {
					// Correction (2): Loop wrongly assumed an end quote ended the search,
					// but search ended because we've reached the end. Correct $end.
					// TODO: This is invalid and should throw.
					$end--;
				}
			// We have to distinguish between regexp literals and division operators
			// A division operator is only possible in certain states
			} elseif ( $ch === '/' && !isset( $divStates[$state] ) ) {
				// Regexp literal
				for ( ; ; ) {
					// Search until we find "/" (end of regexp), "\" (backslash escapes),
					// or "[" (start of character classes).
					do {
						// Speculatively add 2 to ensure next iteration skips
						// over backslash and escaped character.
						// We'll correct this outside the loop.
						$end += strcspn( $s, '/[\\', $end ) + 2;
						// If backslash escape, keep searching...
					} while ( $end - 2 < $length && $s[$end - 2] === '\\' );
					// Correction (1): Undo speculative add, keep only one (end of regexp)
					$end--;
					if ( $end > $length ) {
						// Correction (2): Loop wrongly assumed end slash was seen
						// String ended without end of regexp. Correct $end.
						// TODO: This is invalid and should throw.
						$end--;
						break;
					}
					if ( $s[$end - 1] === '/' ) {
						break;
					}
					// (Implicit else), we must've found the start of a char class,
					// skip until we find "]" (end of char class), or "\" (backslash escape)
					do {
						// Speculatively add 2 for backslash escape.
						// We'll subtract one outside the loop.
						$end += strcspn( $s, ']\\', $end ) + 2;
						// If backslash escape, keep searching...
					} while ( $end - 2 < $length && $s[$end - 2] === '\\' );
					// Correction (1): Undo speculative add, keep only one (end of regexp)
					$end--;
					if ( $end > $length ) {
						// Correction (2): Loop wrongly assumed "]" was seen
						// String ended without ending char class or regexp. Correct $end.
						// TODO: This is invalid and should throw.
						$end--;
						break;
					}
				}
				// Search past the regexp modifiers (gi)
				while ( $end < $length && ctype_alpha( $s[$end] ) ) {
					$end++;
				}
			} elseif (
				$ch === '0'
				&& ( $pos + 1 < $length ) && ( $s[$pos + 1] === 'x' || $s[$pos + 1] === 'X' )
			) {
				// Hex numeric literal
				$end++; // x or X
				$len = strspn( $s, '0123456789ABCDEFabcdef', $end );
				if ( !$len ) {
					return self::parseError(
						$s,
						$pos,
						'Expected a hexadecimal number but found ' . substr( $s, $pos, 5 ) . '...'
					);
				}
				$end += $len;
			} elseif (
				ctype_digit( $ch )
				|| ( $ch === '.' && $pos + 1 < $length && ctype_digit( $s[$pos + 1] ) )
			) {
				$end += strspn( $s, '0123456789', $end );
				$decimal = strspn( $s, '.', $end );
				if ( $decimal ) {
					if ( $decimal > 2 ) {
						return self::parseError( $s, $end, 'The number has too many decimal points' );
					}
					$end += strspn( $s, '0123456789', $end + 1 ) + $decimal;
				}
				$exponent = strspn( $s, 'eE', $end );
				if ( $exponent ) {
					if ( $exponent > 1 ) {
						return self::parseError( $s, $end, 'Number with several E' );
					}
					$end++;

					// + sign is optional; - sign is required.
					$end += strspn( $s, '-+', $end );
					$len = strspn( $s, '0123456789', $end );
					if ( !$len ) {
						return self::parseError(
							$s,
							$pos,
							'No decimal digits after e, how many zeroes should be added?'
						);
					}
					$end += $len;
				}
			} elseif ( isset( $opChars[$ch] ) ) {
				// Punctuation character. Search for the longest matching operator.
				while (
					$end < $length
					&& isset( $tokenTypes[substr( $s, $pos, $end - $pos + 1 )] )
				) {
					$end++;
				}
			} else {
				// Identifier or reserved word. Search for the end by excluding whitespace and
				// punctuation.
				$end += strcspn( $s, " \t\n.;,=<>+-{}()[]?:*/%'\"!&|^~\xb\xc\r", $end );
			}

			// Malformed input (e.g. an unterminated string, regexp or character class whose
			// final character is a backslash) makes the speculative "+ 2" above overshoot by
			// more than the corrections undo. Clamp $end so that the $s[$end - 1] lookup
			// below never reads past the end of the input.
			if ( $end > $length ) {
				$end = $length;
			}

			// Now get the token type from our type array
			$token = substr( $s, $pos, $end - $pos ); // so $end - $pos == strlen( $token )
			$type = $tokenTypes[$token] ?? self::TYPE_LITERAL;

			if ( $newlineFound && isset( $semicolon[$state][$type] ) ) {
				// This token triggers the semicolon insertion mechanism of javascript. While we
				// could add the ; token here ourselves, keeping the newline has a few advantages.
				$out .= "\n";
				$state = self::STATEMENT;
				$lineLength = 0;
			} elseif ( $lineLength + $end - $pos > self::MAX_LINE_LENGTH &&
				!isset( $semicolon[$state][$type] ) && $type !== self::TYPE_INCR_OP ) {
				// This line would get too long if we added $token, so add a newline first.
				// Only do this if it won't trigger semicolon insertion and if it won't
				// put a postfix increment operator on its own line, which is illegal in js.
				$out .= "\n";
				$lineLength = 0;
			// Check, whether we have to separate the token from the last one with whitespace
			} elseif ( !isset( $opChars[$last] ) && !isset( $opChars[$ch] ) ) {
				$out .= ' ';
				$lineLength++;
			// Don't accidentally create ++, -- or // tokens
			} elseif ( $last === $ch && ( $ch === '+' || $ch === '-' || $ch === '/' ) ) {
				$out .= ' ';
				$lineLength++;
			}
			// Shorten boolean literals, but only where they are plain values and not
			// property names accessed through a dot.
			if (
				$type === self::TYPE_LITERAL
				&& ( $token === 'true' || $token === 'false' )
				&& ( $state === self::EXPRESSION || $state === self::PROPERTY_EXPRESSION )
				&& $last !== '.'
			) {
				$token = ( $token === 'true' ) ? '!0' : '!1';
			}

			$out .= $token;
			// Length of the source token; the emitted token may be shorter after the
			// true/false rewrite above, which is fine as MAX_LINE_LENGTH is not strict.
			$lineLength += $end - $pos;
			$last = $s[$end - 1];
			$pos = $end;
			$newlineFound = false;

			// Now that we have output our token, transition into the new state.
			if ( isset( $push[$state][$type] ) && count( $stack ) < self::STACK_LIMIT ) {
				$stack[] = $push[$state][$type];
			}
			if ( $stack && isset( $pop[$state][$type] ) ) {
				$state = array_pop( $stack );
			} elseif ( isset( $goto[$state][$type] ) ) {
				$state = $goto[$state][$type];
			}
		}
		return $out;
	}

	/**
	 * Report a problem found while tokenizing a numeric literal.
	 *
	 * Currently none of the arguments are used; the method only signals failure to
	 * the caller, and minify() passes that value straight through as its own result.
	 *
	 * @param string $fullJavascript The complete input that was being minified
	 * @param int $position Byte offset in the input at which the problem was found
	 * @param string $errorMsg Human-readable description of the problem
	 * @return false
	 */
	public static function parseError( $fullJavascript, $position, $errorMsg ) {
		// TODO: Handle the error: trigger_error, throw exception, return false...
		return false;
	}
}