Files
mongo/jstests/sharding/remove2.js
2018-06-25 16:47:18 -04:00

207 lines
7.7 KiB
JavaScript

/**
* Test that removing and re-adding shard works correctly.
*
* This test is labeled resource intensive because its total io_write is 59MB compared to a median
* of 5MB across all sharding tests in wiredTiger.
* @tags: [resource_intensive]
*/
load("jstests/replsets/rslib.js");
// The UUID consistency check uses connections to shards cached on the ShardingTest object, but this
// test restarts a shard, so the cached connection is not usable.
TestData.skipCheckingUUIDsConsistentAcrossCluster = true;
(function() {
'use strict';
/**
 * Builds the seed string "<set name>/<host>,<host>,..." for a replica set, using the host of
 * every member listed in the set's config.
 *
 * @param replTest object exposing `name` and `getReplSetConfig()` (whose result has `members`).
 * @returns {string} the set name, a slash, then the comma-separated member hosts.
 */
function seedString(replTest) {
    var config = replTest.getReplSetConfig();
    var hosts = [];
    for (var i = 0; i < config.members.length; i++) {
        hosts.push(config.members[i].host);
    }
    return replTest.name + '/' + hosts.join(',');
}
/**
 * Removes the shard named `replTest.name` through the mongos `st.s`, polls until the removal
 * reports state 'completed', then drops the test database (the database of the enclosing
 * test's `coll`) directly on the removed set's primary.
 *
 * @param st object whose `s` connection accepts `adminCommand`.
 * @param replTest replica set handle exposing `name` and `getPrimary()`.
 */
function removeShard(st, replTest) {
    jsTest.log("Removing shard with name: " + replTest.name);
    var res = st.s.adminCommand({removeShard: replTest.name});
    assert.commandWorked(res);
    assert.eq('started', res.state);
    assert.soon(
        function() {
            res = st.s.adminCommand({removeShard: replTest.name});
            assert.commandWorked(res);
            return ('completed' === res.state);
        },
        // Passed as a function so the message is built when the wait fails and therefore shows
        // the most recent removeShard response rather than the initial 'started' one.
        function() {
            return "failed to remove shard: " + tojson(res);
        });
    // Drop the database so the shard can be re-added.
    assert.commandWorked(replTest.getPrimary().getDB(coll.getDB().getName()).dropDatabase());
}
/**
 * Adds the replica set `replTest` as a shard using its seed string, waits for a fresh mongos
 * connection to see the set's secondaries, waits for the chunk difference of the enclosing
 * test's `coll` to drop below 2, and finally checks that all 300 documents are readable.
 *
 * Both the addshard command and the first query are retried once, because the first attempt
 * may hit a transport error.
 *
 * @param st object exposing `adminCommand`, `chunkDiff` and the mongos connection `s`.
 * @param replTest replica set handle accepted by `seedString` and exposing `getSecondaries()`.
 */
function addShard(st, replTest) {
    var seed = seedString(replTest);
    print("Adding shard with seed: " + seed);
    try {
        assert.eq(true, st.adminCommand({addshard: seed}));
    } catch (e) {
        print("First attempt to addShard failed, trying again");
        // Log the failure so an unexpected error is distinguishable from the expected
        // transport error.
        printjson(e);
        // transport error on first attempt is expected. Make sure second attempt goes through
        assert.eq(true, st.adminCommand({addshard: seed}));
    }
    awaitRSClientHosts(
        new Mongo(st.s.host), replTest.getSecondaries(), {ok: true, secondary: true});
    // Allow up to 30 minutes for the chunk difference to fall below 2.
    assert.soon(function() {
        var x = st.chunkDiff(coll.getName(), coll.getDB().getName());
        print("chunk diff: " + x);
        return x < 2;
    }, "no balance happened", 30 * 60 * 1000);
    try {
        assert.eq(300, coll.find().itcount());
    } catch (e) {
        // Expected. First query might get transport error and need to reconnect.
        printjson(e);
        assert.eq(300, coll.find().itcount());
    }
    print("Shard added successfully");
}
// Start a cluster with two shards, each a 2-node replica set, with the balancer enabled and
// chunkSize set to 1.
var st = new ShardingTest(
    {shards: {rs0: {nodes: 2}, rs1: {nodes: 2}}, other: {chunkSize: 1, enableBalancer: true}});
// Pending resolution of SERVER-8598, we need to wait for deletion after chunk migrations to
// avoid a pending delete re-creating a database after it was dropped.
// The write is asserted so that a failure to apply this setting fails the test immediately.
assert.writeOK(
    st.s.getDB("config").settings.update({_id: "balancer"}, {$set: {_waitForDelete: true}}, true));
// Use a separate connection to the mongos for all operations on the test collection.
var conn = new Mongo(st.s.host);
var coll = conn.getCollection("test.remove2");
coll.drop();
assert.commandWorked(st.s0.adminCommand({enableSharding: coll.getDB().getName()}));
st.ensurePrimaryShard(coll.getDB().getName(), st.shard0.shardName);
assert.commandWorked(st.s0.adminCommand({shardCollection: coll.getFullName(), key: {i: 1}}));
// Setup initial data
// Double the string until it is at least 16 * 1024 characters long.
var str = 'a';
while (str.length < 1024 * 16) {
    str += str;
}
// Insert 300 documents; the shard key `i` takes only the values 0 through 9.
var bulk = coll.initializeUnorderedBulkOp();
for (var i = 0; i < 300; i++) {
    bulk.insert({i: i % 10, str: str});
}
assert.writeOK(bulk.execute());
assert.eq(300, coll.find().itcount());
// Allow up to 30 minutes for the chunk difference to fall below 2.
assert.soon(function() {
    var x = st.chunkDiff(coll.getName(), coll.getDB().getName());
    print("chunk diff: " + x);
    return x < 2;
}, "no balance happened", 30 * 60 * 1000);
// The balancing wait must not have changed the number of visible documents.
assert.eq(300, coll.find().itcount());
st.printShardingStatus();
// Scenario 1 operates on the replica set handle that the ShardingTest exposes as `rs1`.
var rst1 = st.rs1;
// Remove shard and add it back in, without shutting it down.
jsTestLog("Attempting to remove shard and add it back in");
// removeShard() and addShard() are the helpers defined earlier in this file.
removeShard(st, rst1);
addShard(st, rst1);
// Remove shard, restart set, then add it back in.
jsTestLog("Attempting to remove shard, restart the set, and then add it back in");
// Record the seed string before the restart; it is compared against the post-restart seed
// below to confirm the set came back with the same hosts.
var originalSeed = seedString(rst1);
removeShard(st, rst1);
rst1.stopSet();
print("Sleeping for 20 seconds to let the other shard's ReplicaSetMonitor time out");
sleep(20000); // 1 failed check should take 10 seconds, sleep for 20 just to be safe
// Bring the same set back up (restart option set) and wait for it to replicate before
// re-adding it as a shard.
rst1.startSet({restart: true});
rst1.initiate();
rst1.awaitReplication();
assert.eq(
originalSeed, seedString(rst1), "Set didn't come back up with the same hosts as before");
addShard(st, rst1);
// Shut down shard and wait for its ReplicaSetMonitor to be cleaned up, then start it back up
// and use it.
//
// TODO: test this both with AND without waiting for the ReplicaSetMonitor to be cleaned up.
//
// This part doesn't pass, even without cleaning up the ReplicaSetMonitor - see SERVER-5900.
/*
printjson( conn.getDB('admin').runCommand({movePrimary : 'test2', to : rst1.name}) );
printjson( conn.getDB('admin').runCommand({setParameter : 1, replMonitorMaxFailedChecks : 5}) );
jsTestLog( "Shutting down set" )
rst1.stopSet();
jsTestLog( "sleeping for 20 seconds to make sure ReplicaSetMonitor gets cleaned up");
sleep(20000); // 1 failed check should take 10 seconds, sleep for 20 just to be safe
// Should fail since rst1 is the primary for test2
assert.throws(function() {conn.getDB('test2').foo.find().itcount()});
jsTestLog( "Bringing set back up" );
rst1.startSet();
rst1.initiate();
rst1.awaitReplication();
jsTestLog( "Checking that set is usable again" );
//conn.getDB('admin').runCommand({flushRouterConfig:1}); // Uncommenting this makes test pass
conn.getDB('test2').foo.insert({a:1});
gle = conn.getDB('test2').runCommand('getLastError');
if ( !gle.ok ) {
// Expected. First write will fail and need to re-connect
print( "write failed" );
printjson( gle );
conn.getDB('test2').foo.insert({a:1});
assert( conn.getDB('test2').getLastErrorObj().ok );
}
assert.eq( 1, conn.getDB('test2').foo.find().itcount() );
assert( conn.getDB('test2').dropDatabase().ok );
*/
// Remove shard and add a new shard with the same replica set and shard name, but different
// ports
jsTestLog("Attempt removing shard and adding a new shard with the same Replica Set name");
removeShard(st, rst1);
rst1.stopSet();
print("Sleeping for 60 seconds to let the other shards restart their ReplicaSetMonitors");
sleep(60000);
// Create a brand-new 2-node set that reuses rst1's name, started with the shardsvr option.
var rst2 = new ReplSetTest({name: rst1.name, nodes: 2, useHostName: true});
rst2.startSet({shardsvr: ""});
rst2.initiate();
rst2.awaitReplication();
addShard(st, rst2);
// NOTE(review): the movePrimary result is only printed, not asserted, and 'test2' is first
// written to two statements below - confirm whether this command is expected to succeed.
printjson(st.admin.runCommand({movePrimary: 'test2', to: rst2.name}));
// The sharded collection must still expose all 300 documents after the shard swap.
assert.eq(300, coll.find().itcount());
// Write one document to the 'test2' database and read it back through the mongos connection.
conn.getDB('test2').foo.insert({a: 1});
assert.eq(1, conn.getDB('test2').foo.find().itcount());
// Can't shut down with rst2 in the set or ShardingTest will fail trying to cleanup on shutdown.
// Have to take out rst2 and put rst1 back into the set so that it can clean up.
jsTestLog("Putting ShardingTest back to state it expects");
// Request that 'test2' be moved to rs0 before rst2 is removed; the result is printed, not
// asserted.
printjson(st.admin.runCommand({movePrimary: 'test2', to: st.rs0.name}));
removeShard(st, rst2);
rst2.stopSet();
print("Sleeping for 60 seconds to let the other shards restart their ReplicaSetMonitors");
sleep(60000);
// Restart the original rs1 set and verify it still has the hosts recorded in originalSeed
// before re-adding it as a shard.
rst1.startSet({restart: true});
rst1.initiate();
rst1.awaitReplication();
assert.eq(
originalSeed, seedString(rst1), "Set didn't come back up with the same hosts as before");
addShard(st, rst1);
st.stop();
})();