From: Tim Starling <tstarling@wikimedia.org>
Date: Thu, 16 Aug 2012 10:36:08 +0000 (+1000)
Subject: CLDR plural parser in PHP
X-Git-Tag: 1.31.0-rc.0~22571^2~1
X-Git-Url: http://git.cyclocoop.org/%24action?a=commitdiff_plain;h=fceb9bcb49fdeaaf978a1667ef382aedb6e715d1;p=lhc%2Fweb%2Fwiklou.git

CLDR plural parser in PHP

Wrote a CLDR plural rule parser to replace the eval()-based one from
I58a9cdfe. It converts the infix notation of the XML files to a
sanitized RPN notation, referred to in external interfaces as the
"compiled" form. The RPN notation is cached and then executed by a
fast non-validating evaluator.

Timings for the largest rule in the XML file are ~1.2ms for
compilation and ~200us for execution.

Also:
* Lazy-load the plural rules when recache() requests them, instead of
  loading them for every request.
* Language::convertPlural() needs integer keys, and CLDR only gives
  string keys. The previous code was not mapping them so it didn't work
  at all. I just mapped them in the order they appear in the XML file,
  i.e. the first rule becomes MediaWiki's $pluralForm=0, the second
  becomes $pluralForm=1, etc. Not sure if there is a more rigorous way
  to do it.

Change-Id: I65ee788c1a8e5ee2ede2091990d86eb722749dd3
---

diff --git a/includes/LocalisationCache.php b/includes/LocalisationCache.php
index c9dd69754d..c1ac848490 100644
--- a/includes/LocalisationCache.php
+++ b/includes/LocalisationCache.php
@@ -154,10 +154,11 @@ class LocalisationCache {
 	 */
 	static public $preloadedKeys = array( 'dateFormats', 'namespaceNames' );
 
-	/*
-	 * Associative array containing plural rules.
+	/**
+	 * Associative array of cached plural rules. The key is the language code,
+	 * the value is an array of plural rules for that language.
 	 */
-	var $pluralRules = array();
+	var $pluralRules = null;
 
 	var $mergeableKeys = null;
 
@@ -207,7 +208,6 @@ class LocalisationCache {
 				$this->$var = $conf[$var];
 			}
 		}
-		$this->readPluralRules();
 	}
 
 	/**
@@ -491,36 +491,62 @@ class LocalisationCache {
 		}
 		return $data;
 	}
+
 	/**
-	 * Read the plural rule xml files.
-	 * First the CLDR xml will be read and it will be extended with
-	 * mediawiki specific tailoring.
+	 * Get the compiled plural rules for a given language from the XML files.
 	 * @since 1.20
 	 */
-	protected function readPluralRules() {
-		$CLDRPlural = __DIR__ . "/../languages/data/plurals.xml";
-		$MWPlural = __DIR__ . "/../languages/data/plurals-mediawiki.xml";
-		# Load CLDR plural rules
-		$this->parsePluralXML( $CLDRPlural );
-		if ( file_exists( $MWPlural ) ) {
-			// override or extend.
-			$this->parsePluralXML( $MWPlural );
+	public function getCompiledPluralRules( $code ) {
+		$rules = $this->getPluralRules( $code );
+		try {
+			$compiledRules = CLDRPluralRuleEvaluator::compile( $rules );
+		} catch( CLDRPluralRuleError $e ) {
+			wfDebugLog( 'l10n', $e->getMessage() . "\n" );
+			return array();
 		}
+		return $compiledRules;
 	}
 
-	private function parsePluralXML( $xmlFile ) {
-		$pluraldoc = new DOMDocument();
-		$pluraldoc->load( $xmlFile );
-		$rulesets = $pluraldoc->getElementsByTagName( "pluralRules" );
+	/**
+	 * Get the plural rules for a given language from the XML files.
+	 * Cached.
+	 * @since 1.20
+	 */
+	public function getPluralRules( $code ) {
+		if ( $this->pluralRules === null ) {
+			$cldrPlural = __DIR__ . "/../languages/data/plurals.xml";
+			$mwPlural = __DIR__ . "/../languages/data/plurals-mediawiki.xml";
+			// Load CLDR plural rules
+			$this->loadPluralFile( $cldrPlural );
+			if ( file_exists( $mwPlural ) ) {
+				// Override or extend
+				$this->loadPluralFile( $mwPlural );
+			}
+		}
+		if ( !isset( $this->pluralRules[$code] ) ) {
+			return array();
+		} else {
+			return $this->pluralRules[$code];
+		}
+	}
+
+	/**
+	 * Load a plural XML file with the given filename, extract the rule strings
+	 * for each locale, and save them (uncompiled) in a process-local cache.
+	 */
+	private function loadPluralFile( $fileName ) {
+		$doc = new DOMDocument;
+		$doc->load( $fileName );
+		$rulesets = $doc->getElementsByTagName( "pluralRules" );
 		foreach ( $rulesets as $ruleset ) {
 			$codes = $ruleset->getAttribute( 'locales' );
-			$parsedRules = array();
-			$rules = $ruleset->getElementsByTagName( "pluralRule" );
-			foreach ( $rules as $rule ) {
-				$parsedRules[$rule->getAttribute( 'count' )] = $rule->nodeValue;
+			$rules = array();
+			$ruleElements = $ruleset->getElementsByTagName( "pluralRule" );
+			foreach ( $ruleElements as $elt ) {
+				$rules[] = $elt->nodeValue;
 			}
 			foreach ( explode( ' ', $codes ) as $code ) {
-				$this->pluralRules[$code] = $parsedRules;
+				$this->pluralRules[$code] = $rules;
 			}
 		}
 	}
@@ -728,10 +754,10 @@ class LocalisationCache {
 		foreach ( self::$splitKeys as $key ) {
 			$allData['list'][$key] = array_keys( $allData[$key] );
 		}
-		# Load CLDR plural rules
-		if ( isset( $this->pluralRules[$code] ) ) {
-			$allData['pluralRules'] = $this->pluralRules[$code];
-		}
+		# Load CLDR plural rules for JavaScript
+		$allData['pluralRules'] = $this->getPluralRules( $code );
+		# And for PHP
+		$allData['compiledPluralRules'] = $this->getCompiledPluralRules( $code );
 		# Run hooks
 		wfRunHooks( 'LocalisationCacheRecache', array( $this, $code, &$allData ) );
 
diff --git a/languages/Language.php b/languages/Language.php
index e67c086518..d1a38bb575 100644
--- a/languages/Language.php
+++ b/languages/Language.php
@@ -4190,6 +4190,15 @@ class Language {
 		return $this->mConverter->getConvRuleTitle();
 	}
 
+	/**
+	 * Get the compiled plural rules for the language
+	 * @since 1.20
+	 * @return array List of compiled (RPN) plural rules, indexed by plural form number
+	 */
+	public function getCompiledPluralRules() {
+		return self::$dataCache->getItem( strtolower( $this->mCode ), 'compiledPluralRules' );
+	}
+
 	/**
 	 * Get the plural rules for the language
 	 * @since 1.20
@@ -4205,8 +4214,8 @@ class Language {
 	 * @return int The index of the plural form
 	 */
 	private function getPluralForm( $number ) {
-		$pluralRules = $this->getPluralRules();
-		$form = CLDRPluralRuleEvaluator::evaluate( $number, $pluralRules );
+		$pluralRules = $this->getCompiledPluralRules();
+		$form = CLDRPluralRuleEvaluator::evaluateCompiled( $number, $pluralRules );
 		return $form;
 	}
 
diff --git a/languages/utils/CLDRPluralRuleEvaluator.php b/languages/utils/CLDRPluralRuleEvaluator.php
index f420e41b58..6b11704376 100644
--- a/languages/utils/CLDRPluralRuleEvaluator.php
+++ b/languages/utils/CLDRPluralRuleEvaluator.php
@@ -1,12 +1,15 @@
 <?php
 /**
- * Parse and evaluate a plural rule
+ * Parse and evaluate a plural rule.
  *
- * @author Niklas Laxstrom
+ * http://unicode.org/reports/tr35/#Language_Plural_Rules
+ *
+ * @author Niklas Laxstrom, Tim Starling
  *
  * @copyright Copyright © 2010-2012, Niklas Laxström
  * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later
  * @file
+ * @since 1.20
  */
 
 class CLDRPluralRuleEvaluator {
@@ -18,55 +21,554 @@ class CLDRPluralRuleEvaluator {
 	 * @param array The associative array of plural rules in pluralform => rule format.
 	 * @return int The index of the plural form which passed the evaluation
 	 */
-	public static function evaluate( $number, $rules ) {
-		$formIndex = 0;
-		if ( !$rules ) {
-			return 0;
+	public static function evaluate( $number, array $rules ) {
+		$rules = self::compile( $rules );
+		return self::evaluateCompiled( $number, $rules );
+	}
+
+	/**
+	 * Convert a set of rules to a compiled form which is optimised for
+	 * fast evaluation. The result will be an array of strings, and may be cached.
+	 *
+	 * @param $rules The rules to compile
+	 * @return An array of compiled rules.
+	 */
+	public static function compile( array $rules ) {
+		// We can't use array_map() for this because it generates a warning if
+		// there is an exception.
+		foreach ( $rules as &$rule ) {
+			$rule = CLDRPluralRuleConverter::convert( $rule );
 		}
-		foreach ( $rules as $form => $rule ) {
-			$parsedRule = self::parseCLDRRule( $rule, $number );
-			// FIXME eval is bad.
-			if ( eval( "return $parsedRule;" ) ) {
-				return $formIndex;
+		return $rules;
+	}
+
+	/**
+	 * Evaluate a compiled set of rules returned by compile(). Do not allow
+	 * the user to edit the compiled form, or else PHP errors may result.
+	 */
+	public static function evaluateCompiled( $number, array $rules ) {
+		// The compiled form is RPN, with tokens strictly delimited by
+		// spaces, so this is a simple RPN evaluator.
+		foreach ( $rules as $i => $rule  ) {
+			$stack = array();
+			$zero = ord( '0' );
+			$nine = ord( '9' );
+			foreach ( StringUtils::explode( ' ', $rule ) as $token ) {
+				$ord = ord( $token );
+				if ( $token === 'n' ) {
+					$stack[] = $number;
+				} elseif ( $ord >= $zero && $ord <= $nine ) {
+					$stack[] = intval( $token );
+				} else {
+					$right = array_pop( $stack );
+					$left = array_pop( $stack );
+					$result = self::doOperation( $token, $left, $right );
+					$stack[] = $result;
+				}
+			}
+			if ( $stack[0] ) {
+				return $i;
 			}
-			$formIndex++;
 		}
-		return $formIndex;
-	}
-	private static function parseCLDRRule( $rule ) {
-		$rule = preg_replace( '/\bn\b/', '$number', $rule );
-		$rule = preg_replace( '/([^ ]+) mod (\d+)/', 'self::mod(\1,\2)', $rule );
-		$rule = preg_replace( '/([^ ]+) is not (\d+)/' , '\1!=\2', $rule );
-		$rule = preg_replace( '/([^ ]+) is (\d+)/', '\1==\2', $rule );
-		$rule = preg_replace( '/([^ ]+) not in (\d+)\.\.(\d+)/', '!self::in(\1,\2,\3)', $rule );
-		$rule = preg_replace( '/([^ ]+) not within (\d+)\.\.(\d+)/', '!self::within(\1,\2,\3)', $rule );
-		$rule = preg_replace( '/([^ ]+) in (\d+)\.\.(\d+)/', 'self::in(\1,\2,\3)', $rule );
-		$rule = preg_replace( '/([^ ]+) within (\d+)\.\.(\d+)/', 'self::within(\1,\2,\3)', $rule );
-		// AND takes precedence over OR
-		$andrule = '/([^ ]+) and ([^ ]+)/i';
-		while ( preg_match( $andrule, $rule ) ) {
-			$rule = preg_replace( $andrule, '(\1&&\2)', $rule );
+		// None of the provided rules match. The number belongs to category
+		// 'other' which comes last.
+		return count( $rules );
+	}
+
+	/**
+	 * Do a single operation
+	 *
+	 * @param $token string The token string
+	 * @param $left The left operand. If it is an object, its state may be destroyed.
+	 * @param $right The right operand
+	 * @return mixed
+	 */
+	private static function doOperation( $token, $left, $right ) {
+		if ( in_array( $token, array( 'in', 'not-in', 'within', 'not-within' ) ) ) {
+			if ( !($right instanceof CLDRPluralRuleEvaluator_Range ) ) {
+				$right = new CLDRPluralRuleEvaluator_Range( $right );
+			}
 		}
-		$orrule = '/([^ ]+) or ([^ ]+)/i';
-		while ( preg_match( $orrule, $rule ) ) {
-			$rule = preg_replace( $orrule, '(\1||\2)', $rule );
+		switch ( $token ) {
+			case 'or':
+				return $left || $right;
+			case 'and':
+				return $left && $right;
+			case 'is':
+				return $left == $right;
+			case 'is-not':
+				return $left != $right;
+			case 'in':
+				return $right->isNumberIn( $left );
+			case 'not-in':
+				return !$right->isNumberIn( $left );
+			case 'within':
+				return $right->isNumberWithin( $left );
+			case 'not-within':
+				return !$right->isNumberWithin( $left );
+			case 'mod':
+				if ( is_int( $left ) ) {
+					return (int) fmod( $left, $right );
+				}
+				return fmod( $left, $right );
+			case ',':
+				if ( $left instanceof CLDRPluralRuleEvaluator_Range ) {
+					$range = $left;
+				} else {
+					$range = new CLDRPluralRuleEvaluator_Range( $left );
+				}
+				$range->add( $right );
+				return $range;
+			case '..':
+				return new CLDRPluralRuleEvaluator_Range( $left, $right );
+			default:
+				throw new CLDRPluralRuleError( "Invalid RPN token" );
 		}
+	}
+}
+
+/**
+ * Evaluator helper class representing a range list.
+ */
+class CLDRPluralRuleEvaluator_Range {
+	var $parts = array();
 
-		return $rule;
+	function __construct( $start, $end = false ) {
+		if ( $end === false ) {
+			$this->parts[] = $start;
+		} else {
+			$this->parts[] = array( $start, $end );
+		}
 	}
 
-	private static function in( $num, $low, $high ) {
-		return is_int( $num ) && $num >= $low && $num <= $high;
+	/**
+	 * Determine if the given number is inside the range. If $integerConstraint
+	 * is true, the number must additionally be an integer if it is to match
+	 * any interval part.
+	 */
+	function isNumberIn( $number, $integerConstraint = true ) {
+		foreach ( $this->parts as $part ) {
+			if ( is_array( $part ) ) {
+				if ( ( !$integerConstraint || floor( $number ) === (float)$number )
+					&& $number >= $part[0] && $number <= $part[1] )
+				{
+					return true;
+				}
+			} else {
+				if ( $number == $part ) {
+					return true;
+				}
+			}
+		}
+		return false;
 	}
 
-	private static function within( $num, $low, $high ) {
-		return $num >= $low && $num <= $high;
+	/**
+	 * Readable alias for isNumberIn( $number, false ), and the implementation
+	 * of the "within" operator.
+	 */
+	function isNumberWithin( $number ) {
+		return $this->isNumberIn( $number, false );
 	}
 
-	private static function mod( $num, $mod ) {
-		if ( is_int( $num ) ) {
-			return (int) fmod( $num, $mod );
+	/**
+	 * Add another part to this range. The supplied new part may either be a
+	 * range object itself, or a single number.
+	 */
+	function add( $other ) {
+		if ( $other instanceof self ) {
+			$this->parts = array_merge( $this->parts, $other->parts );
+		} else {
+			$this->parts[] = $other;
 		}
-		return fmod( $num, $mod );
+	}
+
+	/**
+	 * For debugging
+	 */
+	function __toString() {
+		$s = 'Range(';
+		foreach ( $this->parts as $i => $part ) {
+			if ( $i ) {
+				$s .= ', ';
+			}
+			if ( is_array( $part ) ) {
+				$s .= $part[0] . '..' . $part[1];
+			} else {
+				$s .= $part;
+			}
+		}
+		$s .= ')';
+		return $s;
+	}
+
+}
+
+/**
+ * Helper class for converting rules to reverse polish notation (RPN).
+ */
+class CLDRPluralRuleConverter {
+	var $rule, $pos, $end;
+	var $operators = array();
+	var $operands = array();
+
+	/**
+	 * Precedence levels. Note that there's no need to worry about associativity
+	 * for the level 4 operators, since they return boolean and don't accept
+	 * boolean inputs.
+	 */
+	static $precedence = array(
+		'or' => 2,
+		'and' => 3,
+		'is' => 4,
+		'is-not' => 4,
+		'in' => 4,
+		'not-in' => 4,
+		'within' => 4,
+		'not-within' => 4,
+		'mod' => 5,
+		',' => 6,
+		'..' => 7,
+	);
+
+	/**
+	 * A character list defining whitespace, for use in strspn() etc.
+	 */
+	const WHITESPACE_CLASS = " \t\r\n";
+
+	/**
+	 * Same for digits. Note that the grammar given in UTS #35 doesn't allow
+	 * negative numbers or decimals.
+	 */
+	const NUMBER_CLASS = '0123456789';
+
+	/**
+	 * An anchored regular expression which matches a word at the current offset.
+	 */
+	const WORD_REGEX = '/[a-zA-Z]+/A';
+
+	/**
+	 * Convert a rule to RPN. This is the only public entry point.
+	 */
+	public static function convert( $rule ) {
+		$parser = new self( $rule );
+		return $parser->doConvert();
+	}
+
+	/**
+	 * Protected constructor. Use the static convert() entry point instead.
+	 */
+	protected function __construct( $rule ) {
+		$this->rule = $rule;
+		$this->pos = 0;
+		$this->end = strlen( $rule );
+	}
+
+	/**
+	 * Do the operation.
+	 */
+	protected function doConvert() {
+		$expectOperator = true;
+
+		// Iterate through all tokens, saving the operators and operands to a
+		// stack per Dijkstra's shunting yard algorithm.
+		while ( false !== ( $token = $this->nextToken() ) ) {
+			// In this grammar, there are only binary operators, so every valid
+			// rule string will alternate between operator and operand tokens.
+			$expectOperator = !$expectOperator;
+
+			if ( $token instanceof CLDRPluralRuleConverter_Expression ) {
+				// Operand
+				if ( $expectOperator ) {
+					$token->error( 'unexpected operand' );
+				}
+				$this->operands[] = $token;
+				continue;
+			} else {
+				// Operator
+				if  ( !$expectOperator ) {
+					$token->error( 'unexpected operator' );
+				}
+				// Resolve higher precedence levels
+				$lastOp = end( $this->operators );
+				while ( $lastOp && self::$precedence[$token->name] <= self::$precedence[$lastOp->name] ) {
+					$this->doOperation( $lastOp, $this->operands );
+					array_pop( $this->operators );
+					$lastOp = end( $this->operators );
+				}
+				$this->operators[] = $token;
+			}
+		}
+
+		// Finish off the stack
+		while ( $op = array_pop( $this->operators ) ) {
+			$this->doOperation( $op, $this->operands );
+		}
+
+		// Make sure the result is sane. The first case is possible for an empty
+		// string input, the second should be unreachable.
+		if ( !count( $this->operands ) ) {
+			$this->error( 'condition expected' );
+		} elseif ( count( $this->operands ) > 1 ) {
+			$this->error( 'missing operator or too many operands' );
+		}
+
+		$value = $this->operands[0];
+		if ( $value->type !== 'boolean' ) {
+			$this->error( 'the result must have a boolean type' );
+		}
+
+		return $this->operands[0]->rpn;
+	}
+
+	/**
+	 * Fetch the next token from the input string. Return it as a
+	 * CLDRPluralRuleConverter_Fragment object.
+	 */
+	protected function nextToken() {
+		if ( $this->pos >= $this->end ) {
+			return false;
+		}
+
+		// Whitespace
+		$length = strspn( $this->rule, self::WHITESPACE_CLASS, $this->pos );
+		$this->pos += $length;
+
+		if ( $this->pos >= $this->end ) {
+			return false;
+		}
+
+		// Number
+		$length = strspn( $this->rule, self::NUMBER_CLASS, $this->pos );
+		if ( $length !== 0 ) {
+			$token = $this->newNumber( substr( $this->rule, $this->pos, $length ), $this->pos );
+			$this->pos += $length;
+			return $token;
+		}
+
+		// Comma
+		if ( $this->rule[$this->pos] === ',' ) {
+			$token = $this->newOperator( ',', $this->pos, 1 );
+			$this->pos ++;
+			return $token;
+		}
+
+		// Dot dot
+		if ( substr( $this->rule, $this->pos, 2 ) === '..' ) {
+			$token = $this->newOperator( '..', $this->pos, 2 );
+			$this->pos += 2;
+			return $token;
+		}
+
+		// Word
+		if ( !preg_match( self::WORD_REGEX, $this->rule, $m, 0, $this->pos ) ) {
+			$this->error( 'unexpected character "' . $this->rule[$this->pos] . '"'  );
+		}
+		$word1 = strtolower( $m[0] );
+		$word2 = '';
+		$nextTokenPos = $this->pos + strlen( $word1 );
+		if ( $word1 === 'not' || $word1 === 'is' ) {
+			// Look ahead one word
+			$nextTokenPos += strspn( $this->rule, self::WHITESPACE_CLASS, $nextTokenPos );
+			if ( $nextTokenPos < $this->end
+					&& preg_match( self::WORD_REGEX, $this->rule, $m, 0, $nextTokenPos ) )
+			{
+				$word2 = strtolower( $m[0] );
+				$nextTokenPos += strlen( $word2 );
+			}
+		}
+
+		// Two-word operators like "is not" take precedence over single-word operators like "is"
+		if ( $word2 !== '' ) {
+			$bothWords = "{$word1}-{$word2}";
+			if ( isset( self::$precedence[$bothWords] ) ) {
+				$token = $this->newOperator( $bothWords, $this->pos, $nextTokenPos - $this->pos );
+				$this->pos = $nextTokenPos;
+				return $token;
+			}
+		}
+
+		// Single-word operators
+		if ( isset( self::$precedence[$word1] ) ) {
+			$token = $this->newOperator( $word1, $this->pos, strlen( $word1 ) );
+			$this->pos += strlen( $word1 );
+			return $token;
+		}
+
+		// The special numerical keyword "n"
+		if ( $word1 === 'n' ) {
+			$token = $this->newNumber( 'n', $this->pos );
+			$this->pos ++;
+			return $token;
+		}
+
+		$this->error( 'unrecognised word' );
+	}
+
+	/**
+	 * For the binary operator $op, pop its operands off the stack and push
+	 * a fragment with rpn and type members describing the result of that
+	 * operation.
+	 */
+	protected function doOperation( $op ) {
+		if ( count( $this->operands ) < 2 ) {
+			$op->error( 'missing operand' );
+		}
+		$right = array_pop( $this->operands );
+		$left = array_pop( $this->operands );
+		$result = $op->operate( $left, $right );
+		$this->operands[] = $result;
+	}
+
+	/**
+	 * Create a numerical expression object
+	 */
+	protected function newNumber( $text, $pos ) {
+		return new CLDRPluralRuleConverter_Expression( $this, 'number', $text, $pos, strlen( $text ) );
+	}
+
+	/**
+	 * Create a binary operator
+	 */
+	protected function newOperator( $type, $pos, $length ) {
+		return new CLDRPluralRuleConverter_Operator( $this, $type, $pos, $length );
+	}
+
+	/**
+	 * Throw an error
+	 */
+	protected function error( $message ) {
+		throw new CLDRPluralRuleError( $message );
+	}
+}
+
+/**
+ * Helper for CLDRPluralRuleConverter.
+ * The base class for operators and expressions, describing a region of the input string.
+ */
+class CLDRPluralRuleConverter_Fragment {
+	var $parser, $pos, $length, $end;
+
+	function __construct( $parser, $pos, $length ) {
+		$this->parser = $parser;
+		$this->pos = $pos;
+		$this->length = $length;
+		$this->end = $pos + $length;
+	}
+
+	public function error( $message ) {
+		$text = $this->getText();
+		throw new CLDRPluralRuleError( "$message at position " . ( $this->pos + 1 ) . ": \"$text\"" );
+	}
+
+	public function getText() {
+		return substr( $this->parser->rule, $this->pos, $this->length );
+	}
+}
+
+/**
+ * Helper for CLDRPluralRuleConverter.
+ * An expression object, representing a region of the input string (for error
+ * messages), the RPN notation used to evaluate it, and the result type for
+ * validation.
+ */
+class CLDRPluralRuleConverter_Expression extends CLDRPluralRuleConverter_Fragment {
+	var $type, $rpn;
+
+	function __construct( $parser, $type, $rpn, $pos, $length ) {
+		parent::__construct( $parser, $pos, $length );
+		$this->type = $type;
+		$this->rpn = $rpn;
+	}
+
+	public function isType( $type ) {
+		if ( $type === 'range' && ( $this->type === 'range' || $this->type === 'number' ) ) {
+			return true;
+		}
+		if ( $type === $this->type ) {
+			return true;
+		}
+		return false;
+	}
+}
+
+/**
+ * Helper for CLDRPluralRuleConverter.
+ * An operator object, representing a region of the input string (for error
+ * messages), and the binary operator at that location.
+ */
+class CLDRPluralRuleConverter_Operator extends CLDRPluralRuleConverter_Fragment {
+	var $name;
+
+	/**
+	 * Each op type has three characters: left operand type, right operand type and result type
+	 *
+	 *   b = boolean
+	 *   n = number
+	 *   r = range
+	 *
+	 * A number is a kind of range.
+	 */
+	static $opTypes = array(
+		'or' => 'bbb',
+		'and' => 'bbb',
+		'is' => 'nnb',
+		'is-not' => 'nnb',
+		'in' => 'nrb',
+		'not-in' => 'nrb',
+		'within' => 'nrb',
+		'not-within' => 'nrb',
+		'mod' => 'nnn',
+		',' => 'rrr',
+		'..' => 'nnr',
+	);
+
+	/**
+	 * Map converting from the abbreviation to the full form.
+	 */
+	static $typeSpecMap = array(
+		'b' => 'boolean',
+		'n' => 'number',
+		'r' => 'range',
+	);
+
+	function __construct( $parser, $name, $pos, $length ) {
+		parent::__construct( $parser, $pos, $length );
+		$this->name = $name;
+	}
+
+	public function operate( $left, $right ) {
+		$typeSpec = self::$opTypes[$this->name];
+
+		$leftType = self::$typeSpecMap[$typeSpec[0]];
+		$rightType = self::$typeSpecMap[$typeSpec[1]];
+		$resultType = self::$typeSpecMap[$typeSpec[2]];
+
+		$start = min( $this->pos, $left->pos, $right->pos );
+		$end = max( $this->end, $left->end, $right->end );
+		$length = $end - $start;
+
+		$newExpr = new CLDRPluralRuleConverter_Expression( $this->parser, $resultType,
+			"{$left->rpn} {$right->rpn} {$this->name}",
+			$start, $length );
+
+		if ( !$left->isType( $leftType ) ) {
+			$newExpr->error( "invalid type for left operand: expected $leftType, got {$left->type}" );
+		}
+
+		if ( !$right->isType( $rightType ) ) {
+			$newExpr->error( "invalid type for right operand: expected $rightType, got {$right->type}" );
+		}
+		return $newExpr;
+	}
+}
+
+/**
+ * The exception class for all the classes in this file. This will be thrown
+ * back to the caller if there is any validation error.
+ */
+class CLDRPluralRuleError extends MWException {
+	function __construct( $message ) {
+		parent::__construct( 'CLDR plural rule error: ' . $message );
 	}
 }
diff --git a/tests/phpunit/languages/utils/CLDRPluralRuleEvaluatorTest.php b/tests/phpunit/languages/utils/CLDRPluralRuleEvaluatorTest.php
new file mode 100644
index 0000000000..033164b026
--- /dev/null
+++ b/tests/phpunit/languages/utils/CLDRPluralRuleEvaluatorTest.php
@@ -0,0 +1,95 @@
+<?php
+/**
+ * @author Niklas Laxström
+ * @file
+ */
+
+class CLDRPluralRuleEvaluatorTest extends MediaWikiTestCase {
+	/**
+	 * @dataProvider validTestCases
+	 */
+	function testValidRules( $expected, $rules, $number, $comment ) {
+		$result = CLDRPluralRuleEvaluator::evaluate( $number, (array) $rules );
+		$this->assertEquals( $expected, $result, $comment );
+	}
+
+	/**
+	 * @dataProvider invalidTestCases
+	 * @expectedException CLDRPluralRuleError
+	 */
+	function testInvalidRules( $rules, $comment ) {
+		CLDRPluralRuleEvaluator::evaluate( 1, (array) $rules );
+	}
+
+	function validTestCases() {
+		$tests = array(
+			# expected, rule, number, comment
+			array( 0, 'n is 1', 1, 'integer number and is' ),
+			array( 0, 'n is 1', "1", 'string integer number and is' ),
+			array( 0, 'n is 1', 1.0, 'float number and is' ),
+			array( 0, 'n is 1', "1.0", 'string float number and is' ),
+			array( 1, 'n is 1', 1.1, 'float number and is' ),
+			array( 1, 'n is 1', 2, 'float number and is' ),
+
+			array( 0, 'n in 1,3,5',     3, '' ),
+			array( 1, 'n not in 1,3,5', 5, '' ),
+
+			array( 1, 'n in 1,3,5',     2, '' ),
+			array( 0, 'n not in 1,3,5', 4, '' ),
+
+			array( 0, 'n in 1..3',      2, '' ),
+			array( 0, 'n in 1..3',      3, 'in is inclusive' ),
+			array( 1, 'n in 1..3',      0, '' ),
+
+			array( 1, 'n not in 1..3',      2, '' ),
+			array( 1, 'n not in 1..3',      3, 'in is inclusive' ),
+			array( 0, 'n not in 1..3',      0, '' ),
+
+			array( 1, 'n is not 1 and n is not 2 and n is not 3', 1, 'and relation' ),
+			array( 0, 'n is not 1 and n is not 2 and n is not 4', 3, 'and relation' ),
+
+			array( 0, 'n is not 1 or n is 1', 1, 'or relation' ),
+			array( 1, 'n is 1 or n is 2', 3, 'or relation' ),
+
+			array( 0, 'n              is      1', 1, 'extra whitespace' ),
+
+			array( 0, 'n mod 3 is 1', 7, 'mod' ),
+			array( 0, 'n mod 3 is not 1', 4.3, 'mod with floats' ),
+
+			array( 0, 'n within 1..3', 2, 'within with integer' ),
+			array( 0, 'n within 1..3', 2.5, 'within with float' ),
+			array( 0, 'n in 1..3', 2, 'in with integer' ),
+			array( 1, 'n in 1..3', 2.5, 'in with float' ),
+
+			array( 0, 'n in 3 or n is 4 and n is 5', 3, 'and binds more tightly than or' ),
+			array( 1, 'n is 3 or n is 4 and n is 5', 4, 'and binds more tightly than or' ),
+
+			array( 0, 'n mod 10 in 3..4,9 and n mod 100 not in 10..19,70..79,90..99', 24, 'breton rule' ),
+			array( 1, 'n mod 10 in 3..4,9 and n mod 100 not in 10..19,70..79,90..99', 25, 'breton rule' ),
+
+			array( 0, 'n within 0..2 and n is not 2', 0, 'french rule' ),
+			array( 0, 'n within 0..2 and n is not 2', 1, 'french rule' ),
+			array( 0, 'n within 0..2 and n is not 2', 1.2, 'french rule' ),
+			array( 1, 'n within 0..2 and n is not 2', 2, 'french rule' ),
+
+			array( 1, 'n in 3..10,13..19', 2, 'scottish rule - ranges with comma' ),
+			array( 0, 'n in 3..10,13..19', 4, 'scottish rule - ranges with comma' ),
+			array( 1, 'n in 3..10,13..19', 12.999, 'scottish rule - ranges with comma' ),
+			array( 0, 'n in 3..10,13..19', 13, 'scottish rule - ranges with comma' ),
+
+			array( 0, '5 mod 3 is n', 2, 'n as result of mod - no need to pass' ),
+		);
+
+		return $tests;
+	}
+
+	function invalidTestCases() {
+		$tests = array(
+			array( 'n mod mod 5 is 1', 'mod mod' ),
+			array( 'n', 'just n' ),
+			array( 'n is in 5', 'is in' ),
+		);
+		return $tests;
+	}
+
+}