Mailing List Archive

[MediaWiki-commits] [Gerrit] mediawiki...Wikibase[master]: Allow continuing Wikibase entity dumps
Hoo man has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/405739 )

Change subject: Allow continuing Wikibase entity dumps
......................................................................

Allow continuing Wikibase entity dumps

This only works when SqlEntityIdPager is used internally,
but IMO that is also the only case where continuing makes sense.

Bug: T177550
Change-Id: I05856c8022969b427a4b8045b54afb15011ff6be
---
M repo/includes/Dumpers/DumpGenerator.php
M repo/maintenance/dumpEntities.php
A repo/tests/phpunit/data/maintenance/dumpJson-limit2-log.txt
A repo/tests/phpunit/data/maintenance/dumpJson-limit2-out.txt
M repo/tests/phpunit/maintenance/dumpJsonTest.php
5 files changed, 59 insertions(+), 12 deletions(-)


git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Wikibase refs/changes/39/405739/1

diff --git a/repo/includes/Dumpers/DumpGenerator.php b/repo/includes/Dumpers/DumpGenerator.php
index 8afc47b..a36a528 100644
--- a/repo/includes/Dumpers/DumpGenerator.php
+++ b/repo/includes/Dumpers/DumpGenerator.php
@@ -13,6 +13,7 @@
use Wikibase\Lib\Reporting\RethrowingExceptionHandler;
use Wikibase\Lib\Store\StorageException;
use Wikibase\DataModel\Services\EntityId\EntityIdPager;
+use Wikibase\Repo\Store\Sql\SqlEntityIdPager;

/**
* DumpGenerator generates a dump of a given set of entities, excluding
@@ -243,7 +244,10 @@

// Iterate over batches of IDs, maintaining the current position of the pager in the $position variable.
while ( true ) {
- $ids = $idPager->fetchIds( $this->batchSize );
+ // Try not to overrun $limit in order to make sure pager's position can be used for continuing.
+ $limit = ( $dumpCount + $this->batchSize ) >= $this->limit ? $this->limit - $dumpCount : $this->batchSize;
+
+ $ids = $idPager->fetchIds( $limit );
if ( !$ids ) {
break;
}
@@ -253,6 +257,13 @@
$this->progressReporter->reportMessage( 'Processed ' . $dumpCount . ' entities.' );

if ( $this->limit && $dumpCount >= $this->limit ) {
+ $this->progressReporter->reportMessage( 'Reached entity dump limit of ' . $this->limit . '.' );
+
+ if ( $idPager instanceof SqlEntityIdPager ) {
+ // This message is possibly being parsed for continuation purposes, thus avoid changing it.
+ $this->progressReporter->reportMessage( 'Last SqlEntityIdPager position: ' . $idPager->getPosition() . '.' );
+ }
+
break;
}
}
diff --git a/repo/maintenance/dumpEntities.php b/repo/maintenance/dumpEntities.php
index 5f887f6..d8dd804 100644
--- a/repo/maintenance/dumpEntities.php
+++ b/repo/maintenance/dumpEntities.php
@@ -52,6 +52,7 @@
$this->addOption( 'quiet', "Disable progress reporting", false, false );
$this->addOption( 'limit', "Limit how many entities are dumped.", false, true );
$this->addOption( 'no-cache', "If this is set, don't try to read from an EntityRevisionCache.", false, false );
+ $this->addOption( 'continue', 'Continue parameter for SqlEntityIdPager. Not compatible with --list-file.', false, true );
}

public function setDumpEntitiesServices( SqlEntityIdPagerFactory $sqlEntityIdPagerFactory ) {
@@ -230,7 +231,14 @@
* @return SqlEntityIdPager
*/
private function makeIdQueryStream( $entityType ) {
- return $this->sqlEntityIdPagerFactory->newSqlEntityIdPager( $entityType, $this->getRedirectMode() );
+ $sqlEntityIdPager = $this->sqlEntityIdPagerFactory->newSqlEntityIdPager( $entityType, $this->getRedirectMode() );
+
+ $continue = $this->getOption( 'continue', null );
+ if ( $continue ) {
+ $sqlEntityIdPager->setPosition( intval( $continue ) );
+ }
+
+ return $sqlEntityIdPager;
}

/**
diff --git a/repo/tests/phpunit/data/maintenance/dumpJson-limit2-log.txt b/repo/tests/phpunit/data/maintenance/dumpJson-limit2-log.txt
new file mode 100644
index 0000000..d5a768c
--- /dev/null
+++ b/repo/tests/phpunit/data/maintenance/dumpJson-limit2-log.txt
@@ -0,0 +1,3 @@
+Dumping shard 0/1
+Processed 2 entities.
+Reached entity dump limit of 2.
diff --git a/repo/tests/phpunit/data/maintenance/dumpJson-limit2-out.txt b/repo/tests/phpunit/data/maintenance/dumpJson-limit2-out.txt
new file mode 100644
index 0000000..5e8a480
--- /dev/null
+++ b/repo/tests/phpunit/data/maintenance/dumpJson-limit2-out.txt
@@ -0,0 +1,4 @@
+[
+{"type":"item","id":"Q1","labels":{},"descriptions":{},"aliases":{},"claims":{},"sitelinks":{}},
+{"type":"property","datatype":"string","id":"P1","labels":{},"descriptions":{},"aliases":{},"claims":{}}
+]
diff --git a/repo/tests/phpunit/maintenance/dumpJsonTest.php b/repo/tests/phpunit/maintenance/dumpJsonTest.php
index 7530ed1..a9f55a7 100644
--- a/repo/tests/phpunit/maintenance/dumpJsonTest.php
+++ b/repo/tests/phpunit/maintenance/dumpJsonTest.php
@@ -45,7 +45,7 @@
*/
class DumpJsonTest extends MediaWikiTestCase {

- public function testScript() {
+ private function getDumpJson() {
$dumpScript = new DumpJson();

$mockRepo = new MockRepository();
@@ -141,21 +141,42 @@
$serializerFactory->newEntitySerializer()
);

+ return $dumpScript;
+ }
+
+ public function dumpParameterProvider() {
+ return [
+ 'dump everything' => [
+ [],
+ __DIR__ . '/../data/maintenance/dumpJson-log.txt',
+ __DIR__ . '/../data/maintenance/dumpJson-out.txt',
+ ],
+ 'dump with limit 2' => [
+ [
+ 'limit' => 2,
+ ],
+ __DIR__ . '/../data/maintenance/dumpJson-limit2-log.txt',
+ __DIR__ . '/../data/maintenance/dumpJson-limit2-out.txt',
+ ]
+ ];
+ }
+
+ /**
+ * @dataProvider dumpParameterProvider
+ */
+ public function testScript( array $opts, $expectedLogFile, $expectedOutFile ) {
+ $dumpScript = $this->getDumpJson();
+
$logFileName = tempnam( sys_get_temp_dir(), "Wikibase-DumpJsonTest" );
$outFileName = tempnam( sys_get_temp_dir(), "Wikibase-DumpJsonTest" );

- $dumpScript->loadParamsAndArgs(
- null,
- [
- 'log' => $logFileName,
- 'output' => $outFileName,
- ]
- );
+ $opts = $opts + [ 'log' => $logFileName, 'output' => $outFileName ];
+ $dumpScript->loadParamsAndArgs( null, $opts );

$dumpScript->execute();

- $expectedLog = file_get_contents( __DIR__ . '/../data/maintenance/dumpJson-log.txt' );
- $expectedOut = file_get_contents( __DIR__ . '/../data/maintenance/dumpJson-out.txt' );
+ $expectedLog = file_get_contents( $expectedLogFile );
+ $expectedOut = file_get_contents( $expectedOutFile );

$this->assertEquals(
$this->fixLineEndings( $expectedLog ),

--
To view, visit https://gerrit.wikimedia.org/r/405739
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I05856c8022969b427a4b8045b54afb15011ff6be
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Wikibase
Gerrit-Branch: master
Gerrit-Owner: Hoo man <hoo@online.de>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] mediawiki...Wikibase[master]: Allow continuing Wikibase entity dumps [ In reply to ]
jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/405739 )

Change subject: Allow continuing Wikibase entity dumps
......................................................................


Allow continuing Wikibase entity dumps

This only works when SqlEntityIdPager is used internally,
but IMO that is also the only case where continuing makes sense.

Bug: T177550
Change-Id: I05856c8022969b427a4b8045b54afb15011ff6be
---
M repo/includes/Dumpers/DumpGenerator.php
M repo/maintenance/dumpEntities.php
A repo/tests/phpunit/data/maintenance/dumpJson-limit2-log.txt
A repo/tests/phpunit/data/maintenance/dumpJson-limit2-out.txt
M repo/tests/phpunit/maintenance/dumpJsonTest.php
5 files changed, 63 insertions(+), 12 deletions(-)

Approvals:
Ladsgroup: Looks good to me, approved
jenkins-bot: Verified



diff --git a/repo/includes/Dumpers/DumpGenerator.php b/repo/includes/Dumpers/DumpGenerator.php
index 8afc47b..73049ff 100644
--- a/repo/includes/Dumpers/DumpGenerator.php
+++ b/repo/includes/Dumpers/DumpGenerator.php
@@ -13,6 +13,7 @@
use Wikibase\Lib\Reporting\RethrowingExceptionHandler;
use Wikibase\Lib\Store\StorageException;
use Wikibase\DataModel\Services\EntityId\EntityIdPager;
+use Wikibase\Repo\Store\Sql\SqlEntityIdPager;

/**
* DumpGenerator generates a dump of a given set of entities, excluding
@@ -243,7 +244,14 @@

// Iterate over batches of IDs, maintaining the current position of the pager in the $position variable.
while ( true ) {
- $ids = $idPager->fetchIds( $this->batchSize );
+ if ( $this->limit && ( $dumpCount + $this->batchSize ) > $this->limit ) {
+ // Try not to overrun $limit in order to make sure pager's position can be used for continuing.
+ $limit = $this->limit - $dumpCount;
+ } else {
+ $limit = $this->batchSize;
+ }
+
+ $ids = $idPager->fetchIds( $limit );
if ( !$ids ) {
break;
}
@@ -253,6 +261,13 @@
$this->progressReporter->reportMessage( 'Processed ' . $dumpCount . ' entities.' );

if ( $this->limit && $dumpCount >= $this->limit ) {
+ $this->progressReporter->reportMessage( 'Reached entity dump limit of ' . $this->limit . '.' );
+
+ if ( $idPager instanceof SqlEntityIdPager ) {
+ // This message is possibly being parsed for continuation purposes, thus avoid changing it.
+ $this->progressReporter->reportMessage( 'Last SqlEntityIdPager position: ' . $idPager->getPosition() . '.' );
+ }
+
break;
}
}
diff --git a/repo/maintenance/dumpEntities.php b/repo/maintenance/dumpEntities.php
index 5f887f6..d8dd804 100644
--- a/repo/maintenance/dumpEntities.php
+++ b/repo/maintenance/dumpEntities.php
@@ -52,6 +52,7 @@
$this->addOption( 'quiet', "Disable progress reporting", false, false );
$this->addOption( 'limit', "Limit how many entities are dumped.", false, true );
$this->addOption( 'no-cache', "If this is set, don't try to read from an EntityRevisionCache.", false, false );
+ $this->addOption( 'continue', 'Continue parameter for SqlEntityIdPager. Not compatible with --list-file.', false, true );
}

public function setDumpEntitiesServices( SqlEntityIdPagerFactory $sqlEntityIdPagerFactory ) {
@@ -230,7 +231,14 @@
* @return SqlEntityIdPager
*/
private function makeIdQueryStream( $entityType ) {
- return $this->sqlEntityIdPagerFactory->newSqlEntityIdPager( $entityType, $this->getRedirectMode() );
+ $sqlEntityIdPager = $this->sqlEntityIdPagerFactory->newSqlEntityIdPager( $entityType, $this->getRedirectMode() );
+
+ $continue = $this->getOption( 'continue', null );
+ if ( $continue ) {
+ $sqlEntityIdPager->setPosition( intval( $continue ) );
+ }
+
+ return $sqlEntityIdPager;
}

/**
diff --git a/repo/tests/phpunit/data/maintenance/dumpJson-limit2-log.txt b/repo/tests/phpunit/data/maintenance/dumpJson-limit2-log.txt
new file mode 100644
index 0000000..d5a768c
--- /dev/null
+++ b/repo/tests/phpunit/data/maintenance/dumpJson-limit2-log.txt
@@ -0,0 +1,3 @@
+Dumping shard 0/1
+Processed 2 entities.
+Reached entity dump limit of 2.
diff --git a/repo/tests/phpunit/data/maintenance/dumpJson-limit2-out.txt b/repo/tests/phpunit/data/maintenance/dumpJson-limit2-out.txt
new file mode 100644
index 0000000..5e8a480
--- /dev/null
+++ b/repo/tests/phpunit/data/maintenance/dumpJson-limit2-out.txt
@@ -0,0 +1,4 @@
+[
+{"type":"item","id":"Q1","labels":{},"descriptions":{},"aliases":{},"claims":{},"sitelinks":{}},
+{"type":"property","datatype":"string","id":"P1","labels":{},"descriptions":{},"aliases":{},"claims":{}}
+]
diff --git a/repo/tests/phpunit/maintenance/dumpJsonTest.php b/repo/tests/phpunit/maintenance/dumpJsonTest.php
index 7530ed1..a9f55a7 100644
--- a/repo/tests/phpunit/maintenance/dumpJsonTest.php
+++ b/repo/tests/phpunit/maintenance/dumpJsonTest.php
@@ -45,7 +45,7 @@
*/
class DumpJsonTest extends MediaWikiTestCase {

- public function testScript() {
+ private function getDumpJson() {
$dumpScript = new DumpJson();

$mockRepo = new MockRepository();
@@ -141,21 +141,42 @@
$serializerFactory->newEntitySerializer()
);

+ return $dumpScript;
+ }
+
+ public function dumpParameterProvider() {
+ return [
+ 'dump everything' => [
+ [],
+ __DIR__ . '/../data/maintenance/dumpJson-log.txt',
+ __DIR__ . '/../data/maintenance/dumpJson-out.txt',
+ ],
+ 'dump with limit 2' => [
+ [
+ 'limit' => 2,
+ ],
+ __DIR__ . '/../data/maintenance/dumpJson-limit2-log.txt',
+ __DIR__ . '/../data/maintenance/dumpJson-limit2-out.txt',
+ ]
+ ];
+ }
+
+ /**
+ * @dataProvider dumpParameterProvider
+ */
+ public function testScript( array $opts, $expectedLogFile, $expectedOutFile ) {
+ $dumpScript = $this->getDumpJson();
+
$logFileName = tempnam( sys_get_temp_dir(), "Wikibase-DumpJsonTest" );
$outFileName = tempnam( sys_get_temp_dir(), "Wikibase-DumpJsonTest" );

- $dumpScript->loadParamsAndArgs(
- null,
- [
- 'log' => $logFileName,
- 'output' => $outFileName,
- ]
- );
+ $opts = $opts + [ 'log' => $logFileName, 'output' => $outFileName ];
+ $dumpScript->loadParamsAndArgs( null, $opts );

$dumpScript->execute();

- $expectedLog = file_get_contents( __DIR__ . '/../data/maintenance/dumpJson-log.txt' );
- $expectedOut = file_get_contents( __DIR__ . '/../data/maintenance/dumpJson-out.txt' );
+ $expectedLog = file_get_contents( $expectedLogFile );
+ $expectedOut = file_get_contents( $expectedOutFile );

$this->assertEquals(
$this->fixLineEndings( $expectedLog ),

--
To view, visit https://gerrit.wikimedia.org/r/405739
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I05856c8022969b427a4b8045b54afb15011ff6be
Gerrit-PatchSet: 2
Gerrit-Project: mediawiki/extensions/Wikibase
Gerrit-Branch: master
Gerrit-Owner: Hoo man <hoo@online.de>
Gerrit-Reviewer: Addshore <addshorewiki@gmail.com>
Gerrit-Reviewer: ArielGlenn <ariel@wikimedia.org>
Gerrit-Reviewer: Daniel Kinzler <daniel.kinzler@wikimedia.de>
Gerrit-Reviewer: Hoo man <hoo@online.de>
Gerrit-Reviewer: Ladsgroup <Ladsgroup@gmail.com>
Gerrit-Reviewer: Lucas Werkmeister (WMDE) <lucas.werkmeister@wikimedia.de>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits