174 lines
7.6 KiB
JavaScript
174 lines
7.6 KiB
JavaScript
/**
|
|
* Tests that if the config.version document on a config server is rolled back, that config server
|
|
* will detect the new config.version document when it gets recreated.
|
|
* @tags: [requires_persistence]
|
|
*/
|
|
|
|
(function() {
|
|
"use strict";
|
|
|
|
load("jstests/libs/write_concern_util.js");
|
|
|
|
// The config.version document is written on transition to primary. We need to ensure this
|
|
// config.version document is rolled back for this test.
|
|
//
|
|
// This means we have to guarantee the config.version document is not replicated by a secondary
|
|
// during any of 1) initial sync, 2) steady state replication, or 3) catchup after election.
|
|
//
|
|
// 1) initial sync
|
|
// We need non-primaries to finish initial sync so that they are electable, but without
|
|
// replicating the config.version document. Since we can't control when the config.version
|
|
// document is written (it's an internal write, not a client write), we turn on a failpoint
|
|
// that stalls the write of the config.version document until we have ascertained that the
|
|
// secondaries have finished initial sync.
|
|
//
|
|
// 2) steady state replication
|
|
// Once the non-primaries have transitioned to secondary, we stop the secondaries from
|
|
// replicating anything further by turning on a failpoint that stops the OplogFetcher. We then
|
|
// allow the primary to write the config.verison document.
|
|
//
|
|
// 3) catchup after election
|
|
// When the primary is stepped down and one of the secondaries is elected, the new primary will
|
|
// notice that it is behind the original primary and try to catchup for a short period. So, we
|
|
// also ensure that this short period is 0 by setting catchupTimeoutMillis to 0 earlier in the
|
|
// ReplSetConfig passed to initiate().
|
|
//
|
|
// Thus, we guarantee the new primary will not have replicated the config.version document in
|
|
// initial sync, steady state replication, or catchup, so the document will be rolled back.
|
|
|
|
jsTest.log("Starting the replica set and waiting for secondaries to finish initial sync");
|
|
var configRS = new ReplSetTest({nodes: 3});
|
|
var nodes = configRS.startSet({
|
|
configsvr: '',
|
|
storageEngine: 'wiredTiger',
|
|
setParameter: {
|
|
"failpoint.transitionToPrimaryHangBeforeTakingGlobalExclusiveLock":
|
|
"{'mode':'alwaysOn'}"
|
|
}
|
|
});
|
|
var conf = configRS.getReplSetConfig();
|
|
conf.settings = {catchUpTimeoutMillis: 0};
|
|
|
|
// Ensure conf.members[0] is the only node that can become primary at first, so we know on which
|
|
// nodes to wait for transition to SECONDARY.
|
|
conf.members[1].priority = 0;
|
|
conf.members[2].priority = 0;
|
|
configRS.nodes[0].adminCommand({replSetInitiate: conf});
|
|
|
|
jsTest.log("Waiting for " + nodes[1] + " and " + nodes[2] + " to transition to SECONDARY.");
|
|
configRS.waitForState([nodes[1], nodes[2]], ReplSetTest.State.SECONDARY);
|
|
|
|
jsTest.log("Stopping the replication producer on all nodes");
|
|
// Now that the secondaries have finished initial sync and are electable, stop replication.
|
|
stopServerReplication([nodes[1], nodes[2]]);
|
|
|
|
jsTest.log("Allowing the primary to write the config.version doc");
|
|
nodes.forEach(function(node) {
|
|
assert.commandWorked(node.adminCommand({
|
|
configureFailPoint: "transitionToPrimaryHangBeforeTakingGlobalExclusiveLock",
|
|
mode: "off"
|
|
}));
|
|
});
|
|
|
|
var origPriConn = configRS.getPrimary();
|
|
var secondaries = configRS.getSecondaries();
|
|
|
|
jsTest.log("Confirming that the primary has the config.version doc but the secondaries do not");
|
|
var origConfigVersionDoc;
|
|
assert.soon(function() {
|
|
origConfigVersionDoc = origPriConn.getCollection('config.version').findOne();
|
|
return null !== origConfigVersionDoc;
|
|
});
|
|
secondaries.forEach(function(secondary) {
|
|
secondary.setSlaveOk();
|
|
assert.eq(null, secondary.getCollection('config.version').findOne());
|
|
});
|
|
|
|
jsTest.log("Checking that manually deleting the config.version document is not allowed.");
|
|
assert.writeErrorWithCode(origPriConn.getCollection('config.version').remove({}), 40302);
|
|
assert.commandFailedWithCode(origPriConn.getDB('config').runCommand({drop: 'version'}), 40303);
|
|
|
|
jsTest.log("Making the secondaries electable by giving all nodes non-zero, equal priority.");
|
|
var res = configRS.getPrimary().adminCommand({replSetGetConfig: 1});
|
|
assert.commandWorked(res);
|
|
conf = res.config;
|
|
conf.members[0].priority = 1;
|
|
conf.members[1].priority = 1;
|
|
conf.members[2].priority = 1;
|
|
conf.version++;
|
|
configRS.getPrimary().adminCommand({replSetReconfig: conf});
|
|
|
|
jsTest.log("Stepping down original primary");
|
|
try {
|
|
origPriConn.adminCommand({replSetStepDown: 60, force: true});
|
|
} catch (x) {
|
|
// replSetStepDown closes all connections, thus a network exception is expected here.
|
|
}
|
|
|
|
jsTest.log("Waiting for new primary to be elected and write a new config.version document");
|
|
var newPriConn = configRS.getPrimary();
|
|
assert.neq(newPriConn, origPriConn);
|
|
|
|
var newConfigVersionDoc = newPriConn.getCollection('config.version').findOne();
|
|
assert.neq(null, newConfigVersionDoc);
|
|
assert.neq(origConfigVersionDoc.clusterId, newConfigVersionDoc.clusterId);
|
|
|
|
jsTest.log("Re-enabling replication on all nodes");
|
|
restartServerReplication([nodes[1], nodes[2]]);
|
|
|
|
jsTest.log(
|
|
"Waiting for original primary to rollback and replicate new config.version document");
|
|
configRS.waitForState(origPriConn, ReplSetTest.State.SECONDARY);
|
|
origPriConn.setSlaveOk();
|
|
assert.soonNoExcept(function() {
|
|
var foundClusterId = origPriConn.getCollection('config.version').findOne().clusterId;
|
|
return friendlyEqual(newConfigVersionDoc.clusterId, foundClusterId);
|
|
});
|
|
|
|
jsTest.log("Forcing original primary to step back up and become primary again.");
|
|
|
|
// Do prep work to make original primary transtion to primary again smoother by
|
|
// waiting for all nodes to catch up to make them eligible to become primary and
|
|
// step down the current primary to make it stop generating new oplog entries.
|
|
configRS.awaitReplication();
|
|
|
|
try {
|
|
newPriConn.adminCommand({replSetStepDown: 60, force: true});
|
|
} catch (x) {
|
|
// replSetStepDown closes all connections, thus a network exception is expected here.
|
|
}
|
|
|
|
// Ensure former primary is eligible to become primary once more.
|
|
assert.commandWorked(origPriConn.adminCommand({replSetFreeze: 0}));
|
|
|
|
// Keep on trying until this node becomes the primary. One reason it can fail is when the other
|
|
// nodes have newer oplog entries and will thus refuse to vote for this node.
|
|
assert.soon(function() {
|
|
return (origPriConn.adminCommand({replSetStepUp: 1})).ok;
|
|
});
|
|
|
|
assert.soon(function() {
|
|
return origPriConn == configRS.getPrimary();
|
|
});
|
|
|
|
// Now we just need to start up a mongos and add a shard to confirm that the shard gets added
|
|
// with the proper clusterId value.
|
|
jsTest.log("Starting mongos");
|
|
var mongos = MongoRunner.runMongos({configdb: configRS.getURL()});
|
|
|
|
jsTest.log("Starting shard mongod");
|
|
var shard = MongoRunner.runMongod({shardsvr: ""});
|
|
|
|
jsTest.log("Adding shard to cluster");
|
|
assert.commandWorked(mongos.adminCommand({addShard: shard.host}));
|
|
|
|
jsTest.log("Verifying that shard was provided the proper clusterId");
|
|
var shardIdentityDoc = shard.getDB('admin').system.version.findOne({_id: 'shardIdentity'});
|
|
printjson(shardIdentityDoc);
|
|
assert.eq(newConfigVersionDoc.clusterId,
|
|
shardIdentityDoc.clusterId,
|
|
"oldPriClusterId: " + origConfigVersionDoc.clusterId);
|
|
configRS.stopSet();
|
|
|
|
})();
|