'CachedBagOStuff' => __DIR__ . '/includes/libs/objectcache/CachedBagOStuff.php',
'CachingSiteStore' => __DIR__ . '/includes/site/CachingSiteStore.php',
'CapsCleanup' => __DIR__ . '/maintenance/cleanupCaps.php',
+ 'CategoriesRdf' => __DIR__ . '/includes/CategoriesRdf.php',
'Category' => __DIR__ . '/includes/Category.php',
'CategoryFinder' => __DIR__ . '/includes/CategoryFinder.php',
'CategoryMembershipChange' => __DIR__ . '/includes/changes/CategoryMembershipChange.php',
'Dump7ZipOutput' => __DIR__ . '/includes/export/Dump7ZipOutput.php',
'DumpBZip2Output' => __DIR__ . '/includes/export/DumpBZip2Output.php',
'DumpBackup' => __DIR__ . '/maintenance/dumpBackup.php',
+ 'DumpCategoriesAsRdf' => __DIR__ . '/maintenance/dumpCategoriesAsRdf.php',
'DumpDBZip2Output' => __DIR__ . '/includes/export/DumpDBZip2Output.php',
'DumpFileOutput' => __DIR__ . '/includes/export/DumpFileOutput.php',
'DumpFilter' => __DIR__ . '/includes/export/DumpFilter.php',
"wikimedia/html-formatter": "1.0.1",
"wikimedia/ip-set": "1.1.0",
"wikimedia/php-session-serializer": "1.0.4",
+ "wikimedia/purtle": "1.0.6",
"wikimedia/relpath": "2.0.0",
"wikimedia/remex-html": "1.0.1",
"wikimedia/running-stat": "1.1.0",
--- /dev/null
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!DOCTYPE rdf:RDF [
+ <!ENTITY xsd "http://www.w3.org/2001/XMLSchema#">
+ <!ENTITY rdf "http://www.w3.org/1999/02/22-rdf-syntax-ns#">
+ <!ENTITY rdfs "http://www.w3.org/2000/01/rdf-schema#">
+ <!ENTITY owl "http://www.w3.org/2002/07/owl#">
+ <!ENTITY mediawiki "https://www.mediawiki.org/ontology#">
+]>
+
+<rdf:RDF
+ xmlns:xsd="&xsd;"
+ xmlns:rdf="&rdf;"
+ xmlns:rdfs="&rdfs;"
+ xmlns:owl="&owl;"
+>
+
+ <owl:Ontology rdf:about="&mediawiki;">
+ <rdfs:label>MediaWiki ontology</rdfs:label>
+ <rdfs:comment>The ontology of MediaWiki</rdfs:comment>
+ </owl:Ontology>
+
+ <!--
+ ///////////////////////////////////////////////////////////////////////////////////////
+ //
+ // Classes
+ //
+ ///////////////////////////////////////////////////////////////////////////////////////
+ -->
+
+ <owl:Class rdf:about="&mediawiki;Dump">
+ <rdfs:label>Dump</rdfs:label>
+ <rdfs:comment>A dump of MediaWiki content.</rdfs:comment>
+ </owl:Class>
+
+ <owl:Class rdf:about="&mediawiki;Category">
+ <rdfs:label>Category</rdfs:label>
+ <rdfs:comment>MediaWiki category.</rdfs:comment>
+ </owl:Class>
+
+ <!--
+ ///////////////////////////////////////////////////////////////////////////////////////
+ //
+ // Properties
+ //
+ ///////////////////////////////////////////////////////////////////////////////////////
+ -->
+
+ <owl:ObjectProperty rdf:about="&mediawiki;isInCategory">
+ <rdfs:label>isInCategory</rdfs:label>
+ <rdfs:comment>The subject category is a member (subcategory) of the object category, i.e. the object is the parent.</rdfs:comment>
+ <rdfs:range rdf:resource="&mediawiki;Category"/>
+ <rdfs:domain rdf:resource="&mediawiki;Category"/>
+ </owl:ObjectProperty>
+
+</rdf:RDF>
--- /dev/null
+<?php
+/**
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+use Wikimedia\Purtle\RdfWriter;
+
+/**
+ * Emits the RDF statements that describe wiki categories and the links
+ * between them, using the RdfWriter handed to the constructor.
+ */
+class CategoriesRdf {
+	/**
+	 * Prefix under which the MediaWiki ontology is registered in the dump.
+	 */
+	const ONTOLOGY_PREFIX = 'mediawiki';
+	/**
+	 * Base URL of the MediaWiki ontology.
+	 */
+	const ONTOLOGY_URL = 'https://www.mediawiki.org/ontology#';
+	/**
+	 * Location of the OWL description of the ontology.
+	 */
+	const OWL_URL = 'https://www.mediawiki.org/ontology/ontology.owl';
+	/**
+	 * Version of the dump format currently produced.
+	 */
+	const FORMAT_VERSION = "1.0";
+	/**
+	 * Writer that receives every statement produced by this class.
+	 * @var RdfWriter
+	 */
+	private $rdfWriter;
+
+	/**
+	 * @param RdfWriter $writer Writer to emit the category statements to
+	 */
+	public function __construct( RdfWriter $writer ) {
+		$this->rdfWriter = $writer;
+	}
+
+	/**
+	 * Register the namespace prefixes used by the dump on the writer.
+	 */
+	public function setupPrefixes() {
+		// Registered in exactly this order.
+		$prefixes = [
+			self::ONTOLOGY_PREFIX => self::ONTOLOGY_URL,
+			'rdfs' => 'http://www.w3.org/2000/01/rdf-schema#',
+			'owl' => 'http://www.w3.org/2002/07/owl#',
+			'schema' => 'http://schema.org/',
+			'cc' => 'http://creativecommons.org/ns#',
+		];
+		foreach ( $prefixes as $prefix => $baseUrl ) {
+			$this->rdfWriter->prefix( $prefix, $baseUrl );
+		}
+	}
+
+	/**
+	 * Emit the statement linking a child category to its parent category.
+	 * @param string $fromName Child category name
+	 * @param string $toName Parent category name
+	 */
+	public function writeCategoryLinkData( $fromName, $toName ) {
+		$childUrl = $this->titleToUrl( Title::makeTitle( NS_CATEGORY, $fromName ) );
+		$parentUrl = $this->titleToUrl( Title::makeTitle( NS_CATEGORY, $toName ) );
+
+		$this->rdfWriter->about( $childUrl )
+			->say( self::ONTOLOGY_PREFIX, 'isInCategory' )
+			->is( $parentUrl );
+	}
+
+	/**
+	 * Emit the type and label statements for a single category.
+	 * @param string $categoryName Category name
+	 */
+	public function writeCategoryData( $categoryName ) {
+		$categoryTitle = Title::makeTitle( NS_CATEGORY, $categoryName );
+
+		$this->rdfWriter->about( $this->titleToUrl( $categoryTitle ) )
+			->say( 'a' )
+			->is( self::ONTOLOGY_PREFIX, 'Category' );
+
+		// The label is attached to the subject selected by about() above.
+		$this->rdfWriter->say( 'rdfs', 'label' )->value( $categoryTitle->getText() );
+	}
+
+	/**
+	 * Build the canonical full URL of the page a title points to.
+	 * @param Title $title
+	 * @return string
+	 */
+	private function titleToUrl( Title $title ) {
+		return $title->getFullURL( '', false, PROTO_CANONICAL );
+	}
+}
--- /dev/null
+<?php
+/**
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+use Wikimedia\Purtle\RdfWriter;
+use Wikimedia\Purtle\RdfWriterFactory;
+use Wikimedia\Rdbms\IDatabase;
+
+require_once __DIR__ . '/Maintenance.php';
+
+/**
+ * Maintenance script to provide RDF representation of the category tree.
+ *
+ * @ingroup Maintenance
+ * @since 1.30
+ */
+class DumpCategoriesAsRdf extends Maintenance {
+	/**
+	 * Writer that accumulates the RDF output between drain() calls.
+	 * @var RdfWriter
+	 */
+	private $rdfWriter;
+	/**
+	 * Categories RDF helper.
+	 * @var CategoriesRdf
+	 */
+	private $categoriesRdf;
+
+	public function __construct() {
+		parent::__construct();
+
+		$this->addDescription( "Generate RDF dump of categories in a wiki." );
+
+		$this->setBatchSize( 200 );
+		$this->addOption( 'output', "Output file (default is stdout). Will be overwritten.",
+			false, true );
+		$this->addOption( 'format', "Set the dump format.", false, true );
+	}
+
+	/**
+	 * Produce row iterator for categories.
+	 *
+	 * Iterates the page table restricted to NS_CATEGORY, fetching page_title
+	 * and page_id; execute() consumes the result one batch of rows at a time.
+	 * @param IDatabase $dbr Database connection
+	 * @return RecursiveIterator
+	 */
+	public function getCategoryIterator( IDatabase $dbr ) {
+		$it = new BatchRowIterator(
+			$dbr,
+			'page',
+			[ 'page_title' ],
+			$this->mBatchSize
+		);
+		$it->addConditions( [
+			'page_namespace' => NS_CATEGORY,
+		] );
+		$it->setFetchColumns( [ 'page_title', 'page_id' ] );
+		return $it;
+	}
+
+	/**
+	 * Get iterator for links for categories.
+	 *
+	 * Only 'subcat' rows of categorylinks whose cl_from is one of $ids are
+	 * selected. The batch iterator is wrapped in a RecursiveIteratorIterator,
+	 * and execute() iterates the result row by row.
+	 * @param IDatabase $dbr
+	 * @param array $ids List of page IDs
+	 * @return Traversable
+	 */
+	public function getCategoryLinksIterator( IDatabase $dbr, array $ids ) {
+		$it = new BatchRowIterator(
+			$dbr,
+			'categorylinks',
+			[ 'cl_from', 'cl_to' ],
+			$this->mBatchSize
+		);
+		$it->addConditions( [
+			'cl_type' => 'subcat',
+			'cl_from' => $ids
+		] );
+		$it->setFetchColumns( [ 'cl_from', 'cl_to' ] );
+		return new RecursiveIteratorIterator( $it );
+	}
+
+	/**
+	 * Write the statements describing the dump itself: its types, license,
+	 * format version, modification date, owning site and imported ontology.
+	 * @param mixed $timestamp Dump time, passed to wfTimestamp(); execute() supplies time()
+	 */
+	public function addDumpHeader( $timestamp ) {
+		global $wgRightsUrl;
+		$licenseUrl = $wgRightsUrl;
+		// Protocol-relative license URLs are emitted as https.
+		// NOTE(review): there is no check that $wgRightsUrl is set - confirm what
+		// should be written when the wiki has no license URL configured.
+		if ( substr( $licenseUrl, 0, 2 ) == '//' ) {
+			$licenseUrl = 'https:' . $licenseUrl;
+		}
+		$this->rdfWriter->about( wfExpandUrl( '/categoriesDump', PROTO_CANONICAL ) )
+			->a( 'schema', 'Dataset' )
+			->a( 'owl', 'Ontology' )
+			->say( 'cc', 'license' )->is( $licenseUrl )
+			->say( 'schema', 'softwareVersion' )->value( CategoriesRdf::FORMAT_VERSION )
+			->say( 'schema', 'dateModified' )
+				->value( wfTimestamp( TS_ISO_8601, $timestamp ), 'xsd', 'dateTime' )
+			->say( 'schema', 'isPartOf' )->is( wfExpandUrl( '/', PROTO_CANONICAL ) )
+			->say( 'owl', 'imports' )->is( CategoriesRdf::OWL_URL );
+	}
+
+	/**
+	 * Dump all categories and their parent links to the selected output.
+	 *
+	 * The header is written first; after that every batch of category pages is
+	 * followed by the subcategory links of that batch, and the writer is
+	 * drained to the output after each batch.
+	 */
+	public function execute() {
+		$outFile = $this->getOption( 'output', 'php://stdout' );
+
+		// '-' is accepted as an alias for standard output.
+		if ( $outFile === '-' ) {
+			$outFile = 'php://stdout';
+		}
+
+		// Create the writer before opening the output with mode 'w', so that a
+		// failure to create the writer cannot leave a truncated output file behind.
+		$this->rdfWriter = $this->createRdfWriter( $this->getOption( 'format', 'ttl' ) );
+		$this->categoriesRdf = new CategoriesRdf( $this->rdfWriter );
+
+		$output = fopen( $outFile, 'w' );
+		if ( $output === false ) {
+			// A non-zero second argument asks error() to abort the script; the
+			// return covers the case where it comes back anyway.
+			$this->error( "Could not open output file '$outFile' for writing", 1 );
+			return;
+		}
+
+		$this->categoriesRdf->setupPrefixes();
+		$this->rdfWriter->start();
+
+		$this->addDumpHeader( time() );
+		fwrite( $output, $this->rdfWriter->drain() );
+
+		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
+
+		foreach ( $this->getCategoryIterator( $dbr ) as $batch ) {
+			// Maps page ID => title for this batch; used to resolve cl_from below.
+			$pages = [];
+			foreach ( $batch as $row ) {
+				$this->categoriesRdf->writeCategoryData( $row->page_title );
+				$pages[$row->page_id] = $row->page_title;
+			}
+
+			foreach ( $this->getCategoryLinksIterator( $dbr, array_keys( $pages ) ) as $row ) {
+				$this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->cl_to );
+			}
+			fwrite( $output, $this->rdfWriter->drain() );
+		}
+		fflush( $output );
+		// $outFile can no longer be '-' here, so the handle is always closed.
+		fclose( $output );
+	}
+
+	/**
+	 * Create the RDF writer for the requested output format.
+	 * @param string $format Writer format
+	 * @return RdfWriter
+	 */
+	private function createRdfWriter( $format ) {
+		$factory = new RdfWriterFactory();
+		return $factory->getWriter( $factory->getFormatName( $format ) );
+	}
+}
+
+$maintClass = "DumpCategoriesAsRdf";
+require_once RUN_MAINTENANCE_IF_MAIN;
$this->mergeMwGlobalArrayValue( 'wgHooks', [ $hookName => [ $handler ] ] );
}
+	/**
+	 * Check whether file contains given data.
+	 *
+	 * The file contents are used as the expected value and $actualData as the
+	 * actual value of the equality assertion.
+	 * @param string $fileName Path of the file holding the expected data
+	 * @param string $actualData Data to compare with the file contents
+	 * @param bool $createIfMissing If true, and file does not exist, create it with given data
+	 *    and skip the test. If false, the file is asserted to exist.
+	 * @param string $msg Message passed on to the equality assertion
+	 * @since 1.30
+	 */
+	protected function assertFileContains(
+		$fileName,
+		$actualData,
+		$createIfMissing = true,
+		$msg = ''
+	) {
+		if ( $createIfMissing ) {
+			if ( !file_exists( $fileName ) ) {
+				file_put_contents( $fileName, $actualData );
+				// Double-quoted so that the file name is interpolated into the message.
+				$this->markTestSkipped( "Data file $fileName does not exist" );
+			}
+		} else {
+			self::assertFileExists( $fileName );
+		}
+		self::assertEquals( file_get_contents( $fileName ), $actualData, $msg );
+	}
}
--- /dev/null
+<http://acme.test/categoriesDump> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Dataset> .
+<http://acme.test/categoriesDump> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2002/07/owl#Ontology> .
+<http://acme.test/categoriesDump> <http://creativecommons.org/ns#license> <https://creativecommons.org/licenses/by-sa/3.0/> .
+<http://acme.test/categoriesDump> <http://schema.org/softwareVersion> "1.0" .
+<http://acme.test/categoriesDump> <http://schema.org/dateModified> "{DATE}"^^<http://www.w3.org/2001/XMLSchema#dateTime> .
+<http://acme.test/categoriesDump> <http://schema.org/isPartOf> <http://acme.test/> .
+<http://acme.test/categoriesDump> <http://www.w3.org/2002/07/owl#imports> <https://www.mediawiki.org/ontology/ontology.owl> .
+<http://acme.test/wiki/Category:Category_One> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <https://www.mediawiki.org/ontology#Category> .
+<http://acme.test/wiki/Category:Category_One> <http://www.w3.org/2000/01/rdf-schema#label> "Category One" .
+<http://acme.test/wiki/Category:2_Category_Two> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <https://www.mediawiki.org/ontology#Category> .
+<http://acme.test/wiki/Category:2_Category_Two> <http://www.w3.org/2000/01/rdf-schema#label> "2 Category Two" .
+<http://acme.test/wiki/Category:Category_One> <https://www.mediawiki.org/ontology#isInCategory> <http://acme.test/wiki/Category:Parent_of_1> .
+<http://acme.test/wiki/Category:2_Category_Two> <https://www.mediawiki.org/ontology#isInCategory> <http://acme.test/wiki/Category:Parent_of_2> .
+<http://acme.test/wiki/Category:%D0%A2%D1%80%D0%B5%D1%82%D1%8C%D1%8F_%D0%BA%D0%B0%D1%82%D0%B5%D0%B3%D0%BE%D1%80%D0%B8%D1%8F> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <https://www.mediawiki.org/ontology#Category> .
+<http://acme.test/wiki/Category:%D0%A2%D1%80%D0%B5%D1%82%D1%8C%D1%8F_%D0%BA%D0%B0%D1%82%D0%B5%D0%B3%D0%BE%D1%80%D0%B8%D1%8F> <http://www.w3.org/2000/01/rdf-schema#label> "\u0422\u0440\u0435\u0442\u044C\u044F \u043A\u0430\u0442\u0435\u0433\u043E\u0440\u0438\u044F" .
+<http://acme.test/wiki/Category:%D0%A2%D1%80%D0%B5%D1%82%D1%8C%D1%8F_%D0%BA%D0%B0%D1%82%D0%B5%D0%B3%D0%BE%D1%80%D0%B8%D1%8F> <https://www.mediawiki.org/ontology#isInCategory> <http://acme.test/wiki/Category:Parent_of_3> .
--- /dev/null
+<?php
+
+/**
+ * Runs DumpCategoriesAsRdf against canned category data and compares the
+ * N-Triples output with a stored data file.
+ *
+ * @covers CategoriesRdf
+ * @covers DumpCategoriesAsRdf
+ */
+class CategoriesRdfTest extends MediaWikiLangTestCase {
+	/**
+	 * Canned replacement for DumpCategoriesAsRdf::getCategoryIterator().
+	 * @return array[] Two batches of page rows (page_title, page_id)
+	 */
+	public function getCategoryIterator() {
+		return [
+			// batch 1
+			[
+				(object)[ 'page_title' => 'Category One', 'page_id' => 1 ],
+				(object)[ 'page_title' => '2 Category Two', 'page_id' => 2 ],
+			],
+			// batch 2
+			[
+				(object)[ 'page_title' => 'Третья категория', 'page_id' => 3 ],
+			]
+		];
+	}
+
+	/**
+	 * Canned replacement for DumpCategoriesAsRdf::getCategoryLinksIterator():
+	 * every page gets exactly one parent named "Parent of <page id>".
+	 * @param mixed $dbr Unused
+	 * @param array $ids List of page IDs
+	 * @return stdClass[] One link row (cl_from, cl_to) per page ID
+	 */
+	public function getCategoryLinksIterator( $dbr, array $ids ) {
+		$res = [];
+		foreach ( $ids as $pageid ) {
+			$res[] = (object)[ 'cl_from' => $pageid, 'cl_to' => "Parent of $pageid" ];
+		}
+		return $res;
+	}
+
+	public function testCategoriesDump() {
+		$this->setMwGlobals( [
+			'wgServer' => 'http://acme.test',
+			'wgCanonicalServer' => 'http://acme.test',
+			'wgArticlePath' => '/wiki/$1',
+			'wgRightsUrl' => '//creativecommons.org/licenses/by-sa/3.0/',
+		] );
+
+		$dumpScript =
+			$this->getMockBuilder( DumpCategoriesAsRdf::class )
+				->setMethods( [ 'getCategoryIterator', 'getCategoryLinksIterator' ] )
+				->getMock();
+
+		$dumpScript->expects( $this->once() )
+			->method( 'getCategoryIterator' )
+			->willReturn( $this->getCategoryIterator() );
+
+		$dumpScript->expects( $this->any() )
+			->method( 'getCategoryLinksIterator' )
+			->willReturnCallback( [ $this, 'getCategoryLinksIterator' ] );
+
+		/** @var DumpCategoriesAsRdf $dumpScript */
+		$logFileName = tempnam( sys_get_temp_dir(), "Categories-DumpRdfTest" );
+		$outFileName = tempnam( sys_get_temp_dir(), "Categories-DumpRdfTest" );
+
+		try {
+			$dumpScript->loadParamsAndArgs(
+				null,
+				[
+					'log' => $logFileName,
+					'output' => $outFileName,
+					'format' => 'nt',
+				]
+			);
+
+			$dumpScript->execute();
+			$actualOut = file_get_contents( $outFileName );
+		} finally {
+			// Remove the temporary files even when the dump fails, so that test
+			// runs do not pile up files in the temp directory.
+			foreach ( [ $logFileName, $outFileName ] as $tempFile ) {
+				if ( is_string( $tempFile ) && file_exists( $tempFile ) ) {
+					unlink( $tempFile );
+				}
+			}
+		}
+
+		// The modification date changes on every run; replace it with a fixed marker.
+		$actualOut = preg_replace(
+			'|<http://acme.test/categoriesDump> <http://schema.org/dateModified> "[^"]+?"|',
+			'<http://acme.test/categoriesDump> <http://schema.org/dateModified> "{DATE}"',
+			$actualOut
+		);
+
+		$outFile = __DIR__ . '/../data/categoriesrdf/categoriesRdf-out.nt';
+		$this->assertFileContains( $outFile, $actualOut );
+	}
+
+}