use MediaWiki\MediaWikiServices;
use MediaWiki\Logger\LoggerFactory;
+use MediaWiki\Storage\RevisionRecord;
/**
* Base class for language conversion.
* @since 1.20
* @var array
*/
- static public $languagesWithVariants = [
+ public static $languagesWithVariants = [
'en',
'crh',
'gan',
public $mVariantFallbacks;
public $mVariantNames;
public $mTablesLoaded = false;
+
+ /**
+ * @var ReplacementArray[]
+ * @phan-var array<string,ReplacementArray>
+ */
public $mTables;
+
// 'bidirectional' 'unidirectional' 'disable' for each variant
public $mManualLevel;
IMPORTANT: Beware of failure from pcre.backtrack_limit (T124404).
Minimize use of backtracking where possible.
*/
- $marker = '|' . Parser::MARKER_PREFIX . '[^\x7f]++\x7f';
-
- // this one is needed when the text is inside an HTML markup
- $htmlfix = '|<[^>\004]++(?=\004$)|^[^<>]*+>';
-
- // Optimize for the common case where these tags have
- // few or no children. Thus try and possesively get as much as
- // possible, and only engage in backtracking when we hit a '<'.
-
- // disable convert to variants between <code> tags
- $codefix = '<code>[^<]*+(?:(?:(?!<\/code>).)[^<]*+)*+<\/code>|';
- // disable conversion of <script> tags
- $scriptfix = '<script[^>]*+>[^<]*+(?:(?:(?!<\/script>).)[^<]*+)*+<\/script>|';
- // disable conversion of <pre> tags
- $prefix = '<pre[^>]*+>[^<]*+(?:(?:(?!<\/pre>).)[^<]*+)*+<\/pre>|';
- // The "|.*+)" at the end, is in case we missed some part of html syntax,
- // we will fail securely (hopefully) by matching the rest of the string.
- $htmlFullTag = '<(?:[^>=]*+(?>[^>=]*+=\s*+(?:"[^"]*"|\'[^\']*\'|[^\'">\s]*+))*+[^>=]*+>|.*+)|';
-
- $reg = '/' . $codefix . $scriptfix . $prefix . $htmlFullTag .
- '&[a-zA-Z#][a-z0-9]++;' . $marker . $htmlfix . '|\004$/s';
+ static $reg;
+ if ( $reg === null ) {
+ $marker = '|' . Parser::MARKER_PREFIX . '[^\x7f]++\x7f';
+
+ // this one is needed when the text is inside an HTML markup
+ $htmlfix = '|<[^>\004]++(?=\004$)|^[^<>]*+>';
+
+ // Optimize for the common case where these tags have
+ // few or no children. Thus try and possessively get as much as
+ // possible, and only engage in backtracking when we hit a '<'.
+
+ // disable conversion to variants between <code> tags
+ $codefix = '<code>[^<]*+(?:(?:(?!<\/code>).)[^<]*+)*+<\/code>|';
+ // disable conversion of <script> tags
+ $scriptfix = '<script[^>]*+>[^<]*+(?:(?:(?!<\/script>).)[^<]*+)*+<\/script>|';
+ // disable conversion of <pre> tags
+ $prefix = '<pre[^>]*+>[^<]*+(?:(?:(?!<\/pre>).)[^<]*+)*+<\/pre>|';
+ // The "|.*+)" at the end is in case we missed some part of the HTML syntax;
+ // we will fail securely (hopefully) by matching the rest of the string.
+ $htmlFullTag = '<(?:[^>=]*+(?>[^>=]*+=\s*+(?:"[^"]*"|\'[^\']*\'|[^\'">\s]*+))*+[^>=]*+>|.*+)|';
+
+ $reg = '/' . $codefix . $scriptfix . $prefix . $htmlFullTag .
+ '&[a-zA-Z#][a-z0-9]++;' . $marker . $htmlfix . '|\004$/s';
+ }
$startPos = 0;
$sourceBlob = '';
$literalBlob = '';
// We add a marker (\004) at the end of text, to ensure we always match the
// entire text (Otherwise, pcre.backtrack_limit might cause silent failure)
+ $textWithMarker = $text . "\004";
while ( $startPos < strlen( $text ) ) {
- if ( preg_match( $reg, $text . "\004", $markupMatches, PREG_OFFSET_CAPTURE, $startPos ) ) {
+ if ( preg_match( $reg, $textWithMarker, $markupMatches, PREG_OFFSET_CAPTURE, $startPos ) ) {
$elementPos = $markupMatches[0][1];
$element = $markupMatches[0][0];
if ( $element === "\004" ) {
$revision = Revision::newFromTitle( $title );
if ( $revision ) {
if ( $revision->getContentModel() == CONTENT_MODEL_WIKITEXT ) {
- $txt = $revision->getContent( Revision::RAW )->getNativeData();
+ $txt = $revision->getContent( RevisionRecord::RAW )->getText();
}
// @todo in the future, use a specialized content model, perhaps based on json!
// [1] => 'zh-hant:<span style="font-size:120%;">yyy</span>'
// [2] => ''
// ]
- $pat = '/;\s*(?=';
+ $expandedVariants = [];
foreach ( $this->mVariants as $variant ) {
+ $expandedVariants[ $variant ] = 1;
+ // Accept standard BCP 47 names for variants as well.
+ $expandedVariants[ LanguageCode::bcp47( $variant ) ] = 1;
+ }
+ // Accept old deprecated names for variants
+ foreach ( LanguageCode::getDeprecatedCodeMapping() as $old => $new ) {
+ if ( isset( $expandedVariants[ $new ] ) ) {
+ $expandedVariants[ $old ] = 1;
+ }
+ }
+
+ $pat = '/;\s*(?=';
+ foreach ( $expandedVariants as $variant => $ignore ) {
// zh-hans:xxx;zh-hant:yyy
$pat .= $variant . '\s*:|';
// xxx=>zh-hans:yyy; xxx=>zh-hant:zzz