3 * Parse and evaluate a plural rule.
6 * http://www.unicode.org/reports/tr35/tr35-33/tr35-numbers.html#Language_Plural_Rules
8 * @author Niklas Laxstrom, Tim Starling
10 * @copyright Copyright © 2010-2012, Niklas Laxström
11 * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License as published by
16 * the Free Software Foundation; either version 2 of the License, or
17 * (at your option) any later version.
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
24 * You should have received a copy of the GNU General Public License along
25 * with this program; if not, write to the Free Software Foundation, Inc.,
26 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
27 * http://www.gnu.org/copyleft/gpl.html
33 class CLDRPluralRuleEvaluator
{
/**
 * Find the plural form that applies to a number.
 *
 * Convenience wrapper: the rules are compiled with compile() and the
 * result is passed straight to evaluateCompiled().
 *
 * @param int $number The number to be evaluated against the rules
 * @param array $rules The associative array of plural rules in pluralform => rule format.
 * @return int The index of the plural form which passed the evaluation
 */
public static function evaluate( $number, array $rules ) {
	return self::evaluateCompiled( $number, self::compile( $rules ) );
}
/**
 * Convert a set of rules to a compiled form which is optimised for
 * fast evaluation. The result will be an array of strings, and may be cached.
 *
 * @param array $rules The rules to compile
 * @return array An array of compiled rules, one per input rule, under the same keys.
 * @throws CLDRPluralRuleError If a rule cannot be converted
 */
public static function compile( array $rules ) {
	// We can't use array_map() for this because it generates a warning if
	// there is an exception.
	foreach ( $rules as &$rule ) {
		$rule = CLDRPluralRuleConverter::convert( $rule );
	}
	// Break the reference left behind by the loop so that the last element
	// of $rules is no longer aliased by $rule.
	unset( $rule );

	return $rules;
}
/**
 * Evaluate a compiled set of rules returned by compile(). Do not allow
 * the user to edit the compiled form, or else PHP errors may result.
 *
 * @param string $number The number to be evaluated against the rules, in English, or it
 *   may be a type convertible to string.
 * @param array $rules The compiled rules, as returned by compile().
 * @return int The key of the first rule which passed the evaluation, or
 *   count( $rules ) when the input is not a valid number or no rule matches
 *   (the "other" category).
 */
public static function evaluateCompiled( $number, array $rules ) {
	// Calculate the values of the operand symbols. $m[1] is the absolute
	// value, $m[2] its integer digits and $m[3] its fraction digits (if any).
	$number = strval( $number );
	if ( !preg_match( '/^ -? ( ([0-9]+) (?: \. ([0-9]+) )? )$/x', $number, $m ) ) {
		wfDebug( __METHOD__.': invalid number input, returning "other"' );
		return count( $rules );
	}

	if ( isset( $m[3] ) ) {
		$absValStr = $m[1];
		$intStr = $m[2];
		$fracStr = $m[3];
		$operandSymbols = array(
			'n' => floatval( $absValStr ),
			'i' => intval( $intStr ),
			'v' => strlen( $fracStr ),
			'w' => strlen( rtrim( $fracStr, '0' ) ),
			'f' => intval( $fracStr ),
			't' => intval( rtrim( $fracStr, '0' ) ),
		);
	} else {
		// No fraction part: every fraction-related operand is zero.
		$operandSymbols = array(
			'n' => intval( $m[1] ),
			'i' => intval( $m[1] ),
			'v' => 0,
			'w' => 0,
			'f' => 0,
			't' => 0,
		);
	}

	$zero = ord( '0' );
	$nine = ord( '9' );

	// The compiled form is RPN, with tokens strictly delimited by
	// spaces, so this is a simple RPN evaluator.
	foreach ( $rules as $i => $rule ) {
		$stack = array();
		foreach ( StringUtils::explode( ' ', $rule ) as $token ) {
			// Only the first byte is needed to tell a number literal apart.
			$ord = ord( $token );
			if ( isset( $operandSymbols[$token] ) ) {
				$stack[] = $operandSymbols[$token];
			} elseif ( $ord >= $zero && $ord <= $nine ) {
				$stack[] = intval( $token );
			} else {
				// Anything else is a binary operator; the right operand
				// is on top of the stack.
				$right = array_pop( $stack );
				$left = array_pop( $stack );
				$stack[] = self::doOperation( $token, $left, $right );
			}
		}
		if ( $stack[0] ) {
			return $i;
		}
	}

	// None of the provided rules match. The number belongs to category
	// 'other', which comes last.
	return count( $rules );
}
/**
 * Do a single operation
 *
 * @param string $token The token string
 * @param mixed $left The left operand. If it is an object, its state may be destroyed.
 * @param mixed $right The right operand
 * @throws CLDRPluralRuleError
 * @return mixed The operation result
 */
private static function doOperation( $token, $left, $right ) {
	// The membership operators need a range on the right-hand side; a bare
	// number is wrapped into a single-part range.
	$rangeOperators = array( 'in', 'not-in', 'within', 'not-within' );
	if ( in_array( $token, $rangeOperators )
		&& !( $right instanceof CLDRPluralRuleEvaluator_Range )
	) {
		$right = new CLDRPluralRuleEvaluator_Range( $right );
	}

	switch ( $token ) {
		case 'or':
			return $left || $right;
		case 'and':
			return $left && $right;
		case 'is':
			return $left == $right;
		case 'is-not':
			return $left != $right;
		case 'in':
			return $right->isNumberIn( $left );
		case 'not-in':
			return !$right->isNumberIn( $left );
		case 'within':
			return $right->isNumberWithin( $left );
		case 'not-within':
			return !$right->isNumberWithin( $left );
		case 'mod':
			// Keep an integer result for an integer left operand.
			$remainder = fmod( $left, $right );
			return is_int( $left ) ? (int)$remainder : $remainder;
		case ',':
			// Extend an existing range list in place, or start a new one.
			$range = ( $left instanceof CLDRPluralRuleEvaluator_Range )
				? $left
				: new CLDRPluralRuleEvaluator_Range( $left );
			$range->add( $right );
			return $range;
		case '..':
			return new CLDRPluralRuleEvaluator_Range( $left, $right );
	}

	throw new CLDRPluralRuleError( "Invalid RPN token" );
}
/**
 * Evaluator helper class representing a range list.
 */
class CLDRPluralRuleEvaluator_Range {
	/**
	 * The parts of the range list. Each part is either a single number or
	 * a two-element array( start, end ).
	 *
	 * @var array
	 */
	public $parts = array();

	/**
	 * Initialize a new instance of CLDRPluralRuleEvaluator_Range
	 *
	 * @param int $start The start of the range, or the single number if $end is false
	 * @param int|bool $end The end of the range, or false to store $start as a single number.
	 */
	public function __construct( $start, $end = false ) {
		$this->parts[] = ( $end === false ) ? $start : array( $start, $end );
	}

	/**
	 * Determine if the given number is inside the range.
	 *
	 * @param int $number The number to check
	 * @param bool $integerConstraint If true, a number only matches a start..end
	 *   part when it is an integer; single-number parts are compared by value
	 *   regardless of this flag.
	 * @return bool True if the number is inside the range; otherwise, false.
	 */
	public function isNumberIn( $number, $integerConstraint = true ) {
		foreach ( $this->parts as $part ) {
			if ( !is_array( $part ) ) {
				if ( $number == $part ) {
					return true;
				}
				continue;
			}

			$integerOk = !$integerConstraint || floor( $number ) === (float)$number;
			if ( $integerOk && $number >= $part[0] && $number <= $part[1] ) {
				return true;
			}
		}

		return false;
	}

	/**
	 * Readable alias for isNumberIn( $number, false ), and the implementation
	 * of the "within" operator.
	 *
	 * @param int $number The number to check
	 * @return bool True if the number is inside the range; otherwise, false.
	 */
	public function isNumberWithin( $number ) {
		return $this->isNumberIn( $number, false );
	}

	/**
	 * Add another part to this range.
	 *
	 * @param mixed $other The part to add, either a range object itself or a single number.
	 */
	public function add( $other ) {
		if ( !( $other instanceof self ) ) {
			$this->parts[] = $other;
			return;
		}

		$this->parts = array_merge( $this->parts, $other->parts );
	}

	/**
	 * Returns the string representation of the rule evaluator range.
	 * The purpose of this method is to help debugging.
	 *
	 * @return string The string representation of the rule evaluator range
	 */
	public function __toString() {
		$pieces = array();
		foreach ( $this->parts as $part ) {
			$pieces[] = is_array( $part ) ? $part[0] . '..' . $part[1] : $part;
		}

		return 'Range(' . implode( ', ', $pieces ) . ')';
	}
}
281 * Helper class for converting rules to reverse polish notation (RPN).
283 class CLDRPluralRuleConverter
{
292 * The current position
299 * The past-the-end position
310 public $operators = array();
317 public $operands = array();
320 * Precedence levels. Note that there's no need to worry about associativity
321 * for the level 4 operators, since they return boolean and don't accept
324 static $precedence = array(
339 * A character list defining whitespace, for use in strspn() etc.
341 const WHITESPACE_CLASS
= " \t\r\n";
344 * Same for digits. Note that the grammar given in UTS #35 doesn't allow
345 * negative numbers or decimal separators.
347 const NUMBER_CLASS
= '0123456789';
350 * A character list of symbolic operands.
352 const OPERAND_SYMBOLS
= 'nivwft';
355 * An anchored regular expression which matches a word at the current offset.
357 const WORD_REGEX
= '/[a-zA-Z@]+/A';
/**
 * Convert a rule to RPN. This is the only public entry point.
 *
 * @param string $rule The rule to convert
 * @return string The RPN representation of the rule
 */
public static function convert( $rule ) {
	$converter = new self( $rule );

	return $converter->doConvert();
}
/**
 * Non-public constructor; instances are created through convert().
 *
 * @param string $rule The rule to convert
 */
protected function __construct( $rule ) {
	$this->end = strlen( $rule );
	$this->pos = 0;
	$this->rule = $rule;
}
/**
 * Convert the rule held by this parser to RPN.
 *
 * @return string The RPN representation of the rule (e.g. "5 3 mod n is")
 * @throws CLDRPluralRuleError If the rule is malformed
 */
protected function doConvert() {
	// Toggled before each token is inspected, so inside the loop it is true
	// exactly when the current token is supposed to be an operator.
	$expectOperator = true;

	// Iterate through all tokens, saving the operators and operands to a
	// stack per Dijkstra's shunting yard algorithm.
	while ( false !== ( $token = $this->nextToken() ) ) {
		// In this grammar, there are only binary operators, so every valid
		// rule string will alternate between operator and operand tokens.
		$expectOperator = !$expectOperator;

		if ( $token instanceof CLDRPluralRuleConverter_Expression ) {
			// Operand
			if ( $expectOperator ) {
				$token->error( 'unexpected operand' );
			}
			$this->operands[] = $token;
			continue;
		}

		// Operator
		if ( !$expectOperator ) {
			$token->error( 'unexpected operator' );
		}
		// Resolve higher precedence levels
		$lastOp = end( $this->operators );
		while ( $lastOp && self::$precedence[$token->name] <= self::$precedence[$lastOp->name] ) {
			$this->doOperation( $lastOp );
			array_pop( $this->operators );
			$lastOp = end( $this->operators );
		}
		$this->operators[] = $token;
	}

	// Finish off the stack
	while ( $op = array_pop( $this->operators ) ) {
		$this->doOperation( $op );
	}

	// Make sure the result is sane. The first case is possible for an empty
	// string input, the second should be unreachable.
	if ( !count( $this->operands ) ) {
		$this->error( 'condition expected' );
	} elseif ( count( $this->operands ) > 1 ) {
		$this->error( 'missing operator or too many operands' );
	}

	$value = $this->operands[0];
	if ( $value->type !== 'boolean' ) {
		$this->error( 'the result must have a boolean type' );
	}

	return $value->rpn;
}
/**
 * Fetch the next token from the input string.
 *
 * @return CLDRPluralRuleConverter_Fragment|bool The next token, or false when
 *   the end of the rule (or the start of the samples section) is reached
 * @throws CLDRPluralRuleError If the input at the current position is not a valid token
 */
protected function nextToken() {
	if ( $this->pos >= $this->end ) {
		return false;
	}

	// Whitespace
	$length = strspn( $this->rule, self::WHITESPACE_CLASS, $this->pos );
	$this->pos += $length;

	if ( $this->pos >= $this->end ) {
		return false;
	}

	// Number
	$length = strspn( $this->rule, self::NUMBER_CLASS, $this->pos );
	if ( $length !== 0 ) {
		$token = $this->newNumber( substr( $this->rule, $this->pos, $length ), $this->pos );
		$this->pos += $length;
		return $token;
	}

	// Two-character operators
	$op2 = substr( $this->rule, $this->pos, 2 );
	if ( $op2 === '..' || $op2 === '!=' ) {
		$token = $this->newOperator( $op2, $this->pos, 2 );
		$this->pos += 2;
		return $token;
	}

	// Single-character operators
	$op1 = $this->rule[$this->pos];
	if ( $op1 === ',' || $op1 === '=' || $op1 === '%' ) {
		$token = $this->newOperator( $op1, $this->pos, 1 );
		$this->pos++;
		return $token;
	}

	// Word
	if ( !preg_match( self::WORD_REGEX, $this->rule, $m, 0, $this->pos ) ) {
		$this->error( 'unexpected character "' . $this->rule[$this->pos] . '"' );
	}
	$word1 = strtolower( $m[0] );
	$word2 = '';
	$nextTokenPos = $this->pos + strlen( $word1 );
	if ( $word1 === 'not' || $word1 === 'is' ) {
		// Look ahead one word
		$nextTokenPos += strspn( $this->rule, self::WHITESPACE_CLASS, $nextTokenPos );
		if ( $nextTokenPos < $this->end
			&& preg_match( self::WORD_REGEX, $this->rule, $m, 0, $nextTokenPos )
		) {
			$word2 = strtolower( $m[0] );
			$nextTokenPos += strlen( $word2 );
		}
	}

	// Two-word operators like "is not" take precedence over single-word operators like "is"
	if ( $word2 !== '' ) {
		$bothWords = "{$word1}-{$word2}";
		if ( isset( self::$precedence[$bothWords] ) ) {
			$token = $this->newOperator( $bothWords, $this->pos, $nextTokenPos - $this->pos );
			$this->pos = $nextTokenPos;
			return $token;
		}
	}

	// Single-word operators
	if ( isset( self::$precedence[$word1] ) ) {
		$token = $this->newOperator( $word1, $this->pos, strlen( $word1 ) );
		$this->pos += strlen( $word1 );
		return $token;
	}

	// The single-character operand symbols. The length check matters:
	// strpos() alone would also accept multi-letter substrings of the symbol
	// list (e.g. "ni" or "wf") and then advance by only one byte.
	if ( strlen( $word1 ) === 1 && strpos( self::OPERAND_SYMBOLS, $word1 ) !== false ) {
		$token = $this->newNumber( $word1, $this->pos );
		$this->pos++;
		return $token;
	}

	// Samples
	if ( $word1 === '@integer' || $word1 === '@decimal' ) {
		// Samples are like comments, they have no effect on rule evaluation.
		// They run from the first sample indicator to the end of the string.
		$this->pos = $this->end;
		return false;
	}

	$this->error( 'unrecognised word' );
}
/**
 * For the binary operator $op, pop its operands off the stack and push
 * a fragment with rpn and type members describing the result of that
 * operation.
 *
 * @param CLDRPluralRuleConverter_Operator $op The operator to apply
 */
protected function doOperation( $op ) {
	if ( count( $this->operands ) < 2 ) {
		$op->error( 'missing operand' );
	}

	// The operand pushed last is the right-hand side.
	$right = array_pop( $this->operands );
	$left = array_pop( $this->operands );
	$this->operands[] = $op->operate( $left, $right );
}
/**
 * Create a numerical expression object
 *
 * @param string $text The text of the number or operand symbol; also used as its RPN
 * @param int $pos The position of the text in the rule
 * @return CLDRPluralRuleConverter_Expression The numerical expression
 */
protected function newNumber( $text, $pos ) {
	$length = strlen( $text );

	return new CLDRPluralRuleConverter_Expression( $this, 'number', $text, $pos, $length );
}
/**
 * Create a binary operator
 *
 * @param string $type The operator name as it appears in the rule
 * @param int $pos The position of the operator in the rule
 * @param int $length The length of the operator text
 * @return CLDRPluralRuleConverter_Operator The operator
 */
protected function newOperator( $type, $pos, $length ) {
	$operator = new CLDRPluralRuleConverter_Operator( $this, $type, $pos, $length );

	return $operator;
}
/**
 * Report a conversion error that is not tied to a particular fragment.
 *
 * @param string $message The error message
 * @throws CLDRPluralRuleError Always
 */
protected function error( $message ) {
	$exception = new CLDRPluralRuleError( $message );

	throw $exception;
}
/**
 * Helper for CLDRPluralRuleConverter.
 * The base class for operators and expressions, describing a region of the input string.
 */
class CLDRPluralRuleConverter_Fragment {
	// $parser: the object whose rule string this fragment points into
	// $pos:    offset of the first byte of the region
	// $length: length of the region in bytes
	// $end:    offset just past the region ($pos + $length)
	public $parser, $pos, $length, $end;

	/**
	 * @param object $parser The parser; its public rule member is read by getText()
	 * @param int $pos The start offset of the region
	 * @param int $length The length of the region
	 */
	public function __construct( $parser, $pos, $length ) {
		$this->parser = $parser;
		$this->pos = $pos;
		$this->length = $length;
		$this->end = $pos + $length;
	}

	/**
	 * Throw an error which quotes this fragment and its 1-based position.
	 *
	 * @param string $message The error message
	 * @throws CLDRPluralRuleError Always
	 */
	public function error( $message ) {
		$text = $this->getText();
		$position = $this->pos + 1;

		throw new CLDRPluralRuleError( "$message at position " . $position . ": \"$text\"" );
	}

	/**
	 * Get the part of the rule string covered by this fragment.
	 *
	 * @return string
	 */
	public function getText() {
		$rule = $this->parser->rule;

		return substr( $rule, $this->pos, $this->length );
	}
}
/**
 * Helper for CLDRPluralRuleConverter.
 * An expression object, representing a region of the input string (for error
 * messages), the RPN notation used to evaluate it, and the result type for
 * validation.
 */
class CLDRPluralRuleConverter_Expression extends CLDRPluralRuleConverter_Fragment {
	// $type: the result type name ('number', 'range' or 'boolean')
	// $rpn:  the RPN text which evaluates this expression
	public $type, $rpn;

	/**
	 * @param object $parser The parser
	 * @param string $type The result type of the expression
	 * @param string $rpn The RPN text of the expression
	 * @param int $pos The start offset of the region
	 * @param int $length The length of the region
	 */
	public function __construct( $parser, $type, $rpn, $pos, $length ) {
		parent::__construct( $parser, $pos, $length );
		$this->type = $type;
		$this->rpn = $rpn;
	}

	/**
	 * Check whether this expression can be used where $type is expected.
	 * A number is accepted wherever a range is expected.
	 *
	 * @param string $type The expected type
	 * @return bool
	 */
	public function isType( $type ) {
		if ( $type === $this->type ) {
			return true;
		}

		return $type === 'range' && $this->type === 'number';
	}
}
625 * Helper for CLDRPluralRuleConverter.
626 * An operator object, representing a region of the input string (for error
627 * messages), and the binary operator at that location.
629 class CLDRPluralRuleConverter_Operator
extends CLDRPluralRuleConverter_Fragment
{
638 * Each op type has three characters: left operand type, right operand type and result type
644 * A number is a kind of range.
648 static $opTypes = array(
656 'not-within' => 'nrb',
663 * Map converting from the abbreviation to the full form.
667 static $typeSpecMap = array(
674 * Map for converting the new operators introduced in Rev 33 to the old forms
676 static $aliasMap = array(
/**
 * Initialize a new instance of a CLDRPluralRuleConverter_Operator object
 *
 * @param CLDRPluralRuleConverter $parser The parser
 * @param string $name The operator name; names found in the alias map are replaced
 *   by the name they map to
 * @param int $pos The position
 * @param int $length The length
 */
function __construct( $parser, $name, $pos, $length ) {
	parent::__construct( $parser, $pos, $length );

	$this->name = isset( self::$aliasMap[$name] ) ? self::$aliasMap[$name] : $name;
}
/**
 * Compute the operation
 *
 * Builds the expression for "left right operator" and checks both operands
 * against the types this operator requires.
 *
 * @param CLDRPluralRuleConverter_Expression $left The left part of the expression
 * @param CLDRPluralRuleConverter_Expression $right The right part of the expression
 * @return CLDRPluralRuleConverter_Expression The result of the operation
 * @throws CLDRPluralRuleError If an operand has the wrong type
 */
public function operate( $left, $right ) {
	// Three characters: left operand type, right operand type, result type.
	$typeSpec = self::$opTypes[$this->name];
	$leftType = self::$typeSpecMap[$typeSpec[0]];
	$rightType = self::$typeSpecMap[$typeSpec[1]];
	$resultType = self::$typeSpecMap[$typeSpec[2]];

	// The new expression spans the operator and both of its operands.
	$start = min( $this->pos, $left->pos, $right->pos );
	$length = max( $this->end, $left->end, $right->end ) - $start;
	$rpn = "{$left->rpn} {$right->rpn} {$this->name}";

	$newExpr = new CLDRPluralRuleConverter_Expression(
		$this->parser, $resultType, $rpn, $start, $length
	);

	if ( !$left->isType( $leftType ) ) {
		$newExpr->error( "invalid type for left operand: expected $leftType, got {$left->type}" );
	}

	if ( !$right->isType( $rightType ) ) {
		$newExpr->error( "invalid type for right operand: expected $rightType, got {$right->type}" );
	}

	return $newExpr;
}
/**
 * The exception class for all the classes in this file. This will be thrown
 * back to the caller if there is any validation error.
 */
class CLDRPluralRuleError extends MWException {
	/**
	 * @param string $message The error detail, appended to a fixed prefix
	 */
	public function __construct( $message ) {
		$prefixed = 'CLDR plural rule error: ' . $message;

		parent::__construct( $prefixed );
	}
}