Files
mongo/jstests/sharding/libs/resharding_test_fixture.js
2022-10-11 23:06:20 +00:00

983 lines
43 KiB
JavaScript

"use strict";
load("jstests/libs/fail_point_util.js");
load("jstests/libs/parallelTester.js");
load("jstests/libs/uuid_util.js");
load("jstests/libs/write_concern_util.js");
load("jstests/sharding/libs/create_sharded_collection_util.js");
/**
* Test fixture for resharding a sharded collection once.
*
* Example usage:
*
* const reshardingTest = new ReshardingTest();
* reshardingTest.setup();
* const sourceCollection = reshardingTest.createShardedCollection(...);
* // ... Do some operations before resharding starts ...
* assert.commandWorked(sourceCollection.insert({_id: 0}));
* reshardingTest.withReshardingInBackground({...}, () => {
* // ... Do some operations during the resharding operation ...
* assert.commandWorked(sourceCollection.update({_id: 0}, {$inc: {a: 1}}));
* });
* reshardingTest.teardown();
*/
var ReshardingTest = class {
constructor({
numDonors: numDonors = 1,
numRecipients: numRecipients = 1,
reshardInPlace: reshardInPlace = false,
minimumOperationDurationMS: minimumOperationDurationMS = undefined,
criticalSectionTimeoutMS: criticalSectionTimeoutMS = 24 * 60 * 60 * 1000 /* 1 day */,
periodicNoopIntervalSecs: periodicNoopIntervalSecs = undefined,
writePeriodicNoops: writePeriodicNoops = undefined,
enableElections: enableElections = false,
logComponentVerbosity: logComponentVerbosity = undefined,
storeFindAndModifyImagesInSideCollection: storeFindAndModifyImagesInSideCollection = true,
oplogSize: oplogSize = undefined,
maxNumberOfTransactionOperationsInSingleOplogEntry:
maxNumberOfTransactionOperationsInSingleOplogEntry = undefined
} = {}) {
// The @private JSDoc comments cause VS Code to not display the corresponding properties and
// methods in its autocomplete list. This makes it simpler for test authors to know what the
// public interface of the ReshardingTest class is.
/** @private */
this._numDonors = numDonors;
/** @private */
this._numRecipients = numRecipients;
/** @private */
this._reshardInPlace = reshardInPlace;
/** @private */
this._numShards = this._reshardInPlace ? Math.max(this._numDonors, this._numRecipients)
: this._numDonors + this._numRecipients;
/** @private */
this._minimumOperationDurationMS = minimumOperationDurationMS;
/** @private */
this._criticalSectionTimeoutMS = criticalSectionTimeoutMS;
/** @private */
this._periodicNoopIntervalSecs = periodicNoopIntervalSecs;
/** @private */
this._writePeriodicNoops = writePeriodicNoops;
/** @private */
this._enableElections = enableElections;
/** @private */
this._logComponentVerbosity = logComponentVerbosity;
/** @private */
this._storeFindAndModifyImagesInSideCollection = storeFindAndModifyImagesInSideCollection;
this._oplogSize = oplogSize;
this._maxNumberOfTransactionOperationsInSingleOplogEntry =
maxNumberOfTransactionOperationsInSingleOplogEntry;
// Properties set by setup().
/** @private */
this._st = undefined;
// Properties set by createShardedCollection().
/** @private */
this._ns = undefined;
/** @private */
this._currentShardKey = undefined;
/** @private */
this._sourceCollectionUUID = undefined;
/** @private */
this._tempNs = undefined;
/** @private */
this._primaryShardName = undefined;
// Properties set by startReshardingInBackground() and withReshardingInBackground().
/** @private */
this._newShardKey = undefined;
/** @private */
this._pauseCoordinatorBeforeBlockingWritesFailpoints = [];
/** @private */
this._pauseCoordinatorBeforeDecisionPersistedFailpoints = [];
/** @private */
this._pauseCoordinatorBeforeCompletionFailpoints = [];
/** @private */
this._reshardingThread = undefined;
/** @private */
this._isReshardingActive = false;
/** @private */
this._commandDoneSignal = undefined;
}
setup() {
const mongosOptions = {setParameter: {}};
const configOptions = {setParameter: {}};
const rsOptions = {
setParameter: {
storeFindAndModifyImagesInSideCollection:
this._storeFindAndModifyImagesInSideCollection
}
};
if (this._oplogSize) {
rsOptions.oplogSize = this._oplogSize;
}
const configReplSetTestOptions = {};
let nodesPerShard = 2;
let nodesPerConfigRs = 1;
if (this._enableElections) {
nodesPerShard = 3;
nodesPerConfigRs = 3;
// Increase the likelihood that writes which aren't yet majority-committed end up
// getting rolled back.
rsOptions.settings = {catchUpTimeoutMillis: 0};
configReplSetTestOptions.settings = {catchUpTimeoutMillis: 0};
rsOptions.setParameter.enableElectionHandoff = 0;
configOptions.setParameter.enableElectionHandoff = 0;
// The server conservatively sets the minimum visible timestamp of collections created
// after the oldest_timestamp to be the stable_timestamp. Furthermore, there is no
// guarantee the oldest_timestamp will advance past the creation timestamp of the source
// sharded collection. This means that after a donor shard restarts an atClusterTime
// read at the cloneTimestamp on it would fail with SnapshotUnavailable. We enable the
// following failpoint so the minimum visible timestamp is set to the oldest_timestamp
// regardless. Note that this is safe for resharding tests to do because the source
// sharded collection is guaranteed to exist in the collection catalog at the
// cloneTimestamp and tests involving elections do not run operations which would bump
// the minimum visible timestamp (e.g. creating or dropping indexes).
rsOptions.setParameter["failpoint.setMinVisibleForAllCollectionsToOldestOnStartup"] =
tojson({mode: "alwaysOn"});
}
if (this._minimumOperationDurationMS !== undefined) {
configOptions.setParameter.reshardingMinimumOperationDurationMillis =
this._minimumOperationDurationMS;
}
if (this._criticalSectionTimeoutMS !== -1) {
configOptions.setParameter.reshardingCriticalSectionTimeoutMillis =
this._criticalSectionTimeoutMS;
}
if (this._periodicNoopIntervalSecs !== undefined) {
rsOptions.setParameter.periodicNoopIntervalSecs = this._periodicNoopIntervalSecs;
}
if (this._writePeriodicNoops !== undefined) {
rsOptions.setParameter.writePeriodicNoops = this._writePeriodicNoops;
}
if (this._logComponentVerbosity !== undefined) {
rsOptions.setParameter.logComponentVerbosity = this._logComponentVerbosity;
configOptions.setParameter.logComponentVerbosity = this._logComponentVerbosity;
mongosOptions.setParameter.logComponentVerbosity = this._logComponentVerbosity;
}
if (this._maxNumberOfTransactionOperationsInSingleOplogEntry !== undefined) {
rsOptions.setParameter.maxNumberOfTransactionOperationsInSingleOplogEntry =
this._maxNumberOfTransactionOperationsInSingleOplogEntry;
}
this._st = new ShardingTest({
mongos: 1,
mongosOptions,
config: nodesPerConfigRs,
configOptions,
shards: this._numShards,
rs: {nodes: nodesPerShard},
rsOptions,
configReplSetTestOptions,
manualAddShard: true,
});
for (let i = 0; i < this._numShards; ++i) {
const isDonor = i < this._numDonors;
const isRecipient = i >= (this._numShards - this._numRecipients);
assert(isDonor || isRecipient, {
i,
numDonors: this._numDonors,
numRecipients: this._numRecipients,
numShards: this._numShards,
});
// We add a "-donor" or "-recipient" suffix to the shard's name when it has a singular
// role during the resharding process.
let shardName = `shard${i}`;
if (isDonor && !isRecipient) {
shardName += `-donor${i}`;
} else if (isRecipient && !isDonor) {
shardName += `-recipient${i - this._numDonors}`;
}
const shard = this._st[`shard${i}`];
const res = assert.commandWorked(
this._st.s.adminCommand({addShard: shard.host, name: shardName}));
shard.shardName = res.shardAdded;
}
// In order to enable random failovers, initialize Random's seed if it has not already been
// done.
if (!Random.isInitialized()) {
Random.setRandomSeed();
}
}
/** @private */
_donorShards() {
return Array.from({length: this._numDonors}, (_, i) => this._st[`shard${i}`]);
}
get donorShardNames() {
return this._donorShards().map(shard => shard.shardName);
}
/** @private */
_recipientShards() {
return Array
.from({length: this._numRecipients},
(_, i) => this._st[`shard${this._numShards - 1 - i}`])
.reverse();
}
get recipientShardNames() {
return this._recipientShards().map(shard => shard.shardName);
}
get configShardName() {
return "config";
}
/** @private */
_allReplSetTests() {
return [
{shardName: this.configShardName, rs: this._st.configRS},
...Array.from({length: this._numShards}, (_, i) => this._st[`shard${i}`])
];
}
getReplSetForShard(shardName) {
const res = this._allReplSetTests().find(shardInfo => shardInfo.shardName === shardName);
return res.rs;
}
/**
* Shards a non-existing collection using the specified shard key and chunk ranges.
*
* @param chunks - an array of
* {min: <shardKeyValue0>, max: <shardKeyValue1>, shard: <shardName>} objects. The chunks must
* form a partition of the {shardKey: MinKey} --> {shardKey: MaxKey} space.
*/
createShardedCollection(
{ns, shardKeyPattern, chunks, primaryShardName = this.donorShardNames[0]}) {
this._ns = ns;
this._currentShardKey = Object.assign({}, shardKeyPattern);
const sourceCollection = this._st.s.getCollection(ns);
const sourceDB = sourceCollection.getDB();
assert.commandWorked(sourceDB.adminCommand({enableSharding: sourceDB.getName()}));
// mongos won't know about the temporary resharding collection and will therefore assume the
// collection is unsharded. We configure one of the recipient shards to be the primary shard
// for the database so mongos still ends up routing operations to a shard which owns the
// temporary resharding collection.
this._st.ensurePrimaryShard(sourceDB.getName(), primaryShardName);
this._primaryShardName = primaryShardName;
CreateShardedCollectionUtil.shardCollectionWithChunks(
sourceCollection, shardKeyPattern, chunks);
this._sourceCollectionUUID =
getUUIDFromListCollections(sourceDB, sourceCollection.getName());
const sourceCollectionUUIDString = extractUUIDFromObject(this._sourceCollectionUUID);
this._tempNs = `${sourceDB.getName()}.system.resharding.${sourceCollectionUUIDString}`;
return sourceCollection;
}
get tempNs() {
assert.neq(undefined, this._tempNs, "createShardedCollection must be called first");
return this._tempNs;
}
/**
* Reshards an existing collection using the specified new shard key and new chunk ranges.
*
* @param newChunks - an array of
* {min: <shardKeyValue0>, max: <shardKeyValue1>, shard: <shardName>} objects. The chunks must
* form a partition of the {shardKey: MinKey} --> {shardKey: MaxKey} space.
*
* @deprecated prefer using the withReshardingInBackground() method instead.
*/
startReshardingInBackground({newShardKeyPattern, newChunks}) {
this._startReshardingInBackgroundAndAllowCommandFailure({newShardKeyPattern, newChunks},
ErrorCodes.OK);
}
/** @private */
_startReshardingInBackgroundAndAllowCommandFailure({newShardKeyPattern, newChunks},
expectedErrorCode) {
for (let disallowedErrorCode of [ErrorCodes.FailedToSatisfyReadPreference,
ErrorCodes.HostUnreachable,
]) {
assert.neq(
expectedErrorCode,
disallowedErrorCode,
`${ErrorCodeStrings[disallowedErrorCode]} error must never be expected as final` +
" reshardCollection command error response because it indicates mongos gave" +
" up retrying and the client must instead retry");
}
newChunks = newChunks.map(
chunk => ({min: chunk.min, max: chunk.max, recipientShardId: chunk.shard}));
this._newShardKey = Object.assign({}, newShardKeyPattern);
this._pauseCoordinatorBeforeBlockingWritesFailpoints = [];
this._pauseCoordinatorBeforeDecisionPersistedFailpoints = [];
this._pauseCoordinatorBeforeCompletionFailpoints = [];
this._st.forEachConfigServer((configServer) => {
this._pauseCoordinatorBeforeBlockingWritesFailpoints.push(
configureFailPoint(configServer, "reshardingPauseCoordinatorBeforeBlockingWrites"));
this._pauseCoordinatorBeforeDecisionPersistedFailpoints.push(configureFailPoint(
configServer, "reshardingPauseCoordinatorBeforeDecisionPersisted"));
this._pauseCoordinatorBeforeCompletionFailpoints.push(
configureFailPoint(configServer,
"reshardingPauseCoordinatorBeforeCompletion",
{"sourceNamespace": this._ns}));
});
this._commandDoneSignal = new CountDownLatch(1);
this._reshardingThread =
new Thread(function(host, ns, newShardKeyPattern, newChunks, commandDoneSignal) {
const conn = new Mongo(host);
// We allow the client to retry the reshardCollection a large but still finite
// number of times. This is done because the mongos would also return a
// FailedToSatisfyReadPreference error response when the primary of the shard is
// permanently down (e.g. due to a bug causing the server to crash) and it would be
// preferable to not have the test run indefinitely in that situation.
const kMaxNumAttempts = 40; // = [10 minutes / kDefaultFindHostTimeout]
let res;
for (let i = 1; i <= kMaxNumAttempts; ++i) {
res = conn.adminCommand({
reshardCollection: ns,
key: newShardKeyPattern,
_presetReshardedChunks: newChunks,
});
if (res.ok === 1 ||
(res.code !== ErrorCodes.FailedToSatisfyReadPreference &&
res.code !== ErrorCodes.HostUnreachable)) {
commandDoneSignal.countDown();
break;
}
if (i < kMaxNumAttempts) {
print("Ignoring error from mongos giving up retrying" +
` _shardsvrReshardCollection command: ${tojsononeline(res)}`);
}
}
return res;
}, this._st.s.host, this._ns, newShardKeyPattern, newChunks, this._commandDoneSignal);
this._reshardingThread.start();
this._isReshardingActive = true;
}
/**
* Reshards an existing collection using the specified new shard key and new chunk ranges.
*
* @param newChunks - an array of
* {min: <shardKeyValue0>, max: <shardKeyValue1>, shard: <shardName>} objects. The chunks must
* form a partition of the {shardKey: MinKey} --> {shardKey: MaxKey} space.
*
* @param duringReshardingFn - a function which optionally accepts the temporary resharding
* namespace string. It is only guaranteed to be called after mongos has started running the
* reshardCollection command. Callers should use DiscoverTopology.findConnectedNodes() to
* introspect the state of the donor or recipient shards if they need more specific
* synchronization.
*
* @param expectedErrorCode - the expected response code for the reshardCollection command.
*
* @param postCheckConsistencyFn - a function for evaluating additional correctness
* assertions. This function is called in the critical section, after the `reshardCollection`
* command has shuffled data, but before the coordinator persists a decision.
*
* @param postDecisionPersistedFn - a function for evaluating addition assertions after
* the decision has been persisted, but before the resharding operation finishes and returns
* to the client.
*
* @param afterReshardingFn - a function that will be called after the resharding operation
* finishes but before checking the the state post resharding. By the time afterReshardingFn
* is called the temporary resharding collection will either have been dropped or renamed.
*/
withReshardingInBackground({newShardKeyPattern, newChunks},
duringReshardingFn = (tempNs) => {},
{
expectedErrorCode = ErrorCodes.OK,
postCheckConsistencyFn = (tempNs) => {},
postDecisionPersistedFn = () => {},
afterReshardingFn = () => {}
} = {}) {
this._startReshardingInBackgroundAndAllowCommandFailure({newShardKeyPattern, newChunks},
expectedErrorCode);
assert.soon(() => {
const op = this._findReshardingCommandOp();
return op !== undefined || this._commandDoneSignal.getCount() === 0;
}, "failed to find reshardCollection in $currentOp output");
this._callFunctionSafely(() => duringReshardingFn(this._tempNs));
this._checkConsistencyAndPostState(expectedErrorCode,
() => postCheckConsistencyFn(this._tempNs),
() => postDecisionPersistedFn(),
() => afterReshardingFn());
}
/** @private */
_findReshardingCommandOp() {
return this._st.admin
.aggregate([
{$currentOp: {allUsers: true, localOps: true}},
{$match: {"command.reshardCollection": this._ns}},
])
.toArray()[0];
}
/**
* Wrapper around invoking a 0-argument function to make test failures less confusing.
*
* This helper attempts to disable the reshardingPauseCoordinatorBeforeBlockingWrites
* failpoint when an exception is thrown to prevent the mongo shell from hanging (really the
* config server) on top of having a JavaScript error.
*
* This helper attempts to interrupt and join the resharding thread when an exception is thrown
* to prevent the mongo shell from aborting on top of having a JavaScript error.
*
* @private
*/
_callFunctionSafely(fn) {
try {
fn();
} catch (duringReshardingError) {
for (const fp of [...this._pauseCoordinatorBeforeBlockingWritesFailpoints,
...this._pauseCoordinatorBeforeDecisionPersistedFailpoints,
...this._pauseCoordinatorBeforeCompletionFailpoints]) {
try {
fp.off();
} catch (disableFailpointError) {
print(`Ignoring error from disabling the resharding coordinator failpoint: ${
tojsononeline(disableFailpointError)}`);
print(
"The config server primary and the mongo shell along with it are expected" +
" to hang due to the resharding coordinator being left uninterrupted");
}
}
try {
const op = this._findReshardingCommandOp();
if (op !== undefined) {
assert.commandWorked(this._st.admin.killOp(op.opid));
}
try {
this._reshardingThread.join();
} catch (joinError) {
print(`Ignoring error from the resharding thread: ${tojsononeline(joinError)}`);
} finally {
print(`Ignoring response from the resharding thread: ${
tojsononeline(this._reshardingThread.returnData())}`);
}
this._isReshardingActive = false;
} catch (killOpError) {
print(`Ignoring error from sending killOp to the reshardCollection command: ${
tojsononeline(killOpError)}`);
print("The mongo shell is expected to abort due to the resharding thread being" +
" left unjoined");
}
throw duringReshardingError;
}
}
interruptReshardingThread() {
const op = this._findReshardingCommandOp();
assert.neq(undefined, op, "failed to find reshardCollection in $currentOp output");
assert.commandWorked(this._st.admin.killOp(op.opid));
}
/**
* This method can be called with failpoints that block the `reshardCollection` command from
* proceeding to the next stage. This helper returns after either:
*
* 1) The node's waitForFailPoint returns successfully or
* 2) The `reshardCollection` command has returned a response.
*
* The function returns true when we returned because the server reached the failpoint. The
* function returns false when the `reshardCollection` command is no longer running.
* Otherwise the function throws an exception.
*
* @private
*/
_waitForFailPoint(fp) {
assert.soon(
() => {
return this._commandDoneSignal.getCount() === 0 || fp.waitWithTimeout(1000);
},
"Timed out waiting for failpoint to be hit. Failpoint: " + fp.failPointName,
undefined,
// The `waitWithTimeout` command has the server block for an interval of time.
1);
// return true if the `reshardCollection` command is still running.
return this._commandDoneSignal.getCount() === 1;
}
/** @private */
_checkConsistencyAndPostState(expectedErrorCode,
postCheckConsistencyFn = () => {},
postDecisionPersistedFn = () => {},
afterReshardingFn = () => {}) {
// The CSRS primary may have changed as a result of running the duringReshardingFn()
// callback function. The failpoints will only be triggered on the new CSRS primary so we
// detect which node that is here.
const configPrimary = this._st.configRS.getPrimary();
const primaryIdx = this._pauseCoordinatorBeforeBlockingWritesFailpoints.findIndex(
fp => fp.conn.host === configPrimary.host);
// The CSRS secondaries may be going through replication rollback which closes their
// connections to the test client. We wait for any replication rollbacks to complete and for
// the test client to have reconnected so the failpoints can be turned off on all of the
// nodes later on.
this._st.configRS.awaitSecondaryNodes();
this._st.configRS.awaitReplication();
let performCorrectnessChecks = true;
if (expectedErrorCode === ErrorCodes.OK) {
this._callFunctionSafely(() => {
// We use the reshardingPauseCoordinatorBeforeBlockingWrites failpoint so that
// any intervening writes performed on the sharded collection (from when the
// resharding operation had started until now) are eventually applied by the
// recipient shards. We then use the
// reshardingPauseCoordinatorBeforeDecisionPersisted failpoint to wait for all of
// the recipient shards to have applied through all of the oplog entries from all of
// the donor shards.
if (!this._waitForFailPoint(
this._pauseCoordinatorBeforeBlockingWritesFailpoints[primaryIdx])) {
performCorrectnessChecks = false;
}
this._pauseCoordinatorBeforeBlockingWritesFailpoints.forEach(fp => fp.off());
// A resharding command that returned a failure will not hit the "Decision
// Persisted" failpoint. If the command has returned, don't require that the
// failpoint was entered. This ensures that following up by joining the
// `_reshardingThread` will succeed.
if (!this._waitForFailPoint(
this._pauseCoordinatorBeforeDecisionPersistedFailpoints[primaryIdx])) {
performCorrectnessChecks = false;
}
// Don't correctness check the results if the resharding command unexpectedly
// returned.
if (performCorrectnessChecks) {
assert.commandWorked(this._st.s.adminCommand({flushRouterConfig: this._ns}));
this._checkConsistency();
this._checkDocumentOwnership();
postCheckConsistencyFn();
}
this._pauseCoordinatorBeforeDecisionPersistedFailpoints.forEach(fp => fp.off());
postDecisionPersistedFn();
this._pauseCoordinatorBeforeCompletionFailpoints.forEach(fp => fp.off());
});
} else {
this._callFunctionSafely(() => {
this._pauseCoordinatorBeforeBlockingWritesFailpoints.forEach(
fp => this.retryOnceOnNetworkError(fp.off));
postCheckConsistencyFn();
this._pauseCoordinatorBeforeDecisionPersistedFailpoints.forEach(
fp => this.retryOnceOnNetworkError(fp.off));
postDecisionPersistedFn();
this._pauseCoordinatorBeforeCompletionFailpoints.forEach(
fp => this.retryOnceOnNetworkError(fp.off));
});
}
try {
this._reshardingThread.join();
} finally {
this._isReshardingActive = false;
}
if (expectedErrorCode === ErrorCodes.OK) {
assert.commandWorked(this._reshardingThread.returnData());
} else {
assert.commandFailedWithCode(this._reshardingThread.returnData(), expectedErrorCode);
}
// Reaching this line implies the `_reshardingThread` has successfully exited without
// throwing an exception. Assert that we performed all expected correctness checks.
assert(performCorrectnessChecks, {
msg: "Reshard collection succeeded, but correctness checks were not performed.",
expectedErrorCode: expectedErrorCode
});
afterReshardingFn();
this._checkPostState(expectedErrorCode);
}
/** @private */
_checkConsistency() {
// The "available" read concern level won't block this find cmd behind the critical section.
// Tests for resharding are not expected to have unowned documents in the collection being
// resharded.
const nsCursor =
this._st.s.getCollection(this._ns).find().readConcern("available").sort({_id: 1});
const tempNsCursor = this._st.s.getCollection(this._tempNs).find().sort({_id: 1});
const diff = ((diff) => {
return {
docsWithDifferentContents: diff.docsWithDifferentContents.map(
({first, second}) => ({original: first, resharded: second})),
docsExtraAfterResharding: diff.docsMissingOnFirst,
docsMissingAfterResharding: diff.docsMissingOnSecond,
};
})(DataConsistencyChecker.getDiff(nsCursor, tempNsCursor));
assert.eq(diff,
{
docsWithDifferentContents: [],
docsExtraAfterResharding: [],
docsMissingAfterResharding: [],
},
"existing sharded collection " + this._ns +
" and temporary resharding collection " + this._tempNs + " had different" +
" contents");
}
/** @private */
_checkDocumentOwnership() {
// The "available" read concern level won't perform any ownership filtering. Any documents
// which were copied by a recipient shard that are actually owned by a different recipient
// shard would appear as extra documents.
const tempColl = this._st.s.getCollection(this._tempNs);
const localReadCursor = tempColl.find().sort({_id: 1});
const availableReadCursor =
new DBCommandCursor(tempColl.getDB(), assert.commandWorked(tempColl.runCommand("find", {
sort: {_id: 1},
readConcern: {level: "available"},
})));
const diff = ((diff) => {
return {
docsWithDifferentContents: diff.docsWithDifferentContents.map(
({first, second}) => ({local: first, available: second})),
docsFoundUnownedWithReadAvailable: diff.docsMissingOnFirst,
docsNotFoundWithReadAvailable: diff.docsMissingOnSecond,
};
})(DataConsistencyChecker.getDiff(localReadCursor, availableReadCursor));
assert.eq(diff,
{
docsWithDifferentContents: [],
docsFoundUnownedWithReadAvailable: [],
docsNotFoundWithReadAvailable: [],
},
"temporary resharding collection had unowned documents");
}
/** @private */
_checkPostState(expectedErrorCode) {
this._checkCoordinatorPostState(expectedErrorCode);
for (let recipient of this._recipientShards()) {
this._checkRecipientPostState(recipient, expectedErrorCode);
}
for (let donor of this._donorShards()) {
this._checkDonorPostState(donor, expectedErrorCode);
}
}
/** @private */
_checkCoordinatorPostState(expectedErrorCode) {
assert.eq([],
this._st.config.reshardingOperations.find({ns: this._ns}).toArray(),
"expected config.reshardingOperations to be empty, but found it wasn't");
assert.eq([],
this._st.config.collections.find({reshardingFields: {$exists: true}}).toArray(),
"expected there to be no config.collections entries with 'reshardingFields' set");
assert.eq([],
this._st.config.collections.find({allowMigrations: {$exists: true}}).toArray(),
"expected there to be no config.collections entries with 'allowMigrations' set");
assert.eq([],
this._st.config.collections.find({_id: this._tempNs}).toArray(),
"expected there to not be a config.collections entry for the temporary" +
" resharding collection");
assert.eq([],
this._st.config.chunks.find({ns: this._tempNs}).toArray(),
"expected there to not be any config.chunks entry for the temporary" +
" resharding collection");
const collEntry = this._st.config.collections.findOne({_id: this._ns});
assert.neq(null, collEntry, `didn't find config.collections entry for ${this._ns}`);
if (expectedErrorCode === ErrorCodes.OK) {
assert.eq(this._newShardKey,
collEntry.key,
"shard key pattern didn't change despite resharding having succeeded");
assert.neq(this._sourceCollectionUUID,
collEntry.uuid,
"collection UUID didn't change despite resharding having succeeded");
} else {
assert.eq(this._currentShardKey,
collEntry.key,
"shard key pattern changed despite resharding having failed");
assert.eq(this._sourceCollectionUUID,
collEntry.uuid,
"collection UUID changed despite resharding having failed");
}
}
/** @private */
_checkRecipientPostState(recipient, expectedErrorCode) {
assert.eq(
null,
recipient.getCollection(this._tempNs).exists(),
`expected the temporary resharding collection to not exist, but found it does on ${
recipient.shardName}`);
const collInfo = recipient.getCollection(this._ns).exists();
const isAlsoDonor = this._donorShards().includes(recipient);
if (expectedErrorCode === ErrorCodes.OK) {
assert.neq(null,
collInfo,
`collection doesn't exist on ${
recipient.shardName} despite resharding having succeeded`);
assert.neq(this._sourceCollectionUUID,
collInfo.info.uuid,
`collection UUID didn't change on ${
recipient.shardName} despite resharding having succeeded`);
} else if (expectedErrorCode !== ErrorCodes.OK && !isAlsoDonor) {
assert.eq(
null,
collInfo,
`collection exists on ${recipient.shardName} despite resharding having failed`);
}
assert.eq(
[],
recipient.getCollection(`config.localReshardingOperations.recipient.progress_applier`)
.find()
.toArray(),
`config.localReshardingOperations.recipient.progress_applier wasn't cleaned up on ${
recipient.shardName}`);
assert.eq(
[],
recipient
.getCollection(`config.localReshardingOperations.recipient.progress_txn_cloner`)
.find()
.toArray(),
`config.localReshardingOperations.recipient.progress_txn_cloner wasn't cleaned up on ${
recipient.shardName}`);
const sourceCollectionUUIDString = extractUUIDFromObject(this._sourceCollectionUUID);
for (const donor of this._donorShards()) {
assert.eq(null,
recipient
.getCollection(`config.localReshardingOplogBuffer.${
sourceCollectionUUIDString}.${donor.shardName}`)
.exists(),
`expected config.localReshardingOplogBuffer.${sourceCollectionUUIDString}.${
donor.shardName} not to exist on ${recipient.shardName}, but it did.`);
assert.eq(null,
recipient
.getCollection(`config.localReshardingConflictStash.${
sourceCollectionUUIDString}.${donor.shardName}`)
.exists(),
`expected config.localReshardingConflictStash.${sourceCollectionUUIDString}.${
donor.shardName} not to exist on ${recipient.shardName}, but it did.`);
}
const localRecipientOpsNs = "config.localReshardingOperations.recipient";
let res;
assert.soon(
() => {
res = recipient.getCollection(localRecipientOpsNs).find().toArray();
return res.length === 0;
},
() => `${localRecipientOpsNs} document wasn't cleaned up on ${recipient.shardName}: ${
tojson(res)}`);
}
/** @private */
_checkDonorPostState(donor, expectedErrorCode) {
const collInfo = donor.getCollection(this._ns).exists();
const isAlsoRecipient =
this._recipientShards().includes(donor) || donor.shardName === this._primaryShardName;
if (expectedErrorCode === ErrorCodes.OK && !isAlsoRecipient) {
assert(collInfo == null,
`collection exists on ${donor.shardName} despite resharding having succeeded`);
} else if (expectedErrorCode !== ErrorCodes.OK) {
assert.neq(
null,
collInfo,
`collection doesn't exist on ${donor.shardName} despite resharding having failed`);
assert.eq(
this._sourceCollectionUUID,
collInfo.info.uuid,
`collection UUID changed on ${donor.shardName} despite resharding having failed`);
}
const localDonorOpsNs = "config.localReshardingOperations.donor";
let res;
assert.soon(
() => {
res = donor.getCollection(localDonorOpsNs).find().toArray();
return res.length === 0;
},
() => `${localDonorOpsNs} document wasn't cleaned up on ${donor.shardName}: ${
tojson(res)}`);
}
teardown() {
if (this._isReshardingActive) {
this._checkConsistencyAndPostState(ErrorCodes.OK);
}
this._st.stop();
}
/**
* Given the shardName, steps up a secondary (chosen at random) to become the new primary of the
* shard replica set. To force an election on the configsvr rather than a participant shard, use
* shardName = this.configShardName;
*/
stepUpNewPrimaryOnShard(shardName) {
jsTestLog(`ReshardingTestFixture stepping up new primary on shard ${shardName}`);
const replSet = this.getReplSetForShard(shardName);
let originalPrimary = replSet.getPrimary();
let secondaries = replSet.getSecondaries();
while (secondaries.length > 0) {
// Once the primary is terminated/killed/stepped down, write availability is lost. Avoid
// long periods where the replica set doesn't have write availability by trying to step
// up secondaries until one succeeds.
const newPrimaryIdx = Random.randInt(secondaries.length);
const newPrimary = secondaries[newPrimaryIdx];
let res;
try {
res = newPrimary.adminCommand({replSetStepUp: 1});
} catch (e) {
if (!isNetworkError(e)) {
throw e;
}
jsTest.log(
`ReshardingTestFixture got a network error ${tojson(e)} while` +
` attempting to step up secondary ${newPrimary.host}. This is likely due to` +
` the secondary previously having transitioned through ROLLBACK and closing` +
` its user connections. Will retry stepping up the same secondary again`);
res = newPrimary.adminCommand({replSetStepUp: 1});
}
if (res.ok === 1) {
replSet.awaitNodesAgreeOnPrimary();
assert.eq(newPrimary, replSet.getPrimary());
return;
}
jsTest.log(`ReshardingTestFixture failed to step up secondary ${newPrimary.host} and` +
` got error ${tojson(res)}. Will retry on another secondary until all` +
` secondaries have been exhausted`);
secondaries.splice(newPrimaryIdx, 1);
}
jsTest.log(`ReshardingTestFixture failed to step up secondaries, trying to step` +
` original primary back up`);
replSet.stepUp(originalPrimary, {awaitReplicationBeforeStepUp: false});
}
killAndRestartPrimaryOnShard(shardName) {
jsTestLog(`ReshardingTestFixture killing and restarting primary on shard ${shardName}`);
const replSet = this.getReplSetForShard(shardName);
const originalPrimaryConn = replSet.getPrimary();
const SIGKILL = 9;
const opts = {allowedExitCode: MongoRunner.EXIT_SIGKILL};
replSet.restart(originalPrimaryConn, opts, SIGKILL);
replSet.awaitNodesAgreeOnPrimary();
}
shutdownAndRestartPrimaryOnShard(shardName) {
jsTestLog(
`ReshardingTestFixture shutting down and restarting primary on shard ${shardName}`);
const replSet = this.getReplSetForShard(shardName);
const originalPrimaryConn = replSet.getPrimary();
const SIGTERM = 15;
replSet.restart(originalPrimaryConn, {}, SIGTERM);
replSet.awaitNodesAgreeOnPrimary();
}
/**
* @returns the timestamp chosen by the resharding operation for cloning.
*
* Should also be used in tandem with retryableWriteManager when calling this method in a
* jstestfuzzer code for backwards compatibility, like so:
*
* if (reshardingTest.awaitCloneTimestampChosen !== undefined) {
* fetchTimestamp = reshardingTest.awaitCloneTimestampChosen();
* } else {
* fetchTimestamp = retryableWriteManager.awaitFetchTimestampChosen();
* }
*/
awaitCloneTimestampChosen() {
let cloneTimestamp;
assert.soon(() => {
const coordinatorDoc = this._st.config.reshardingOperations.findOne({ns: this._ns});
cloneTimestamp = coordinatorDoc !== null ? coordinatorDoc.cloneTimestamp : undefined;
return cloneTimestamp !== undefined;
});
return cloneTimestamp;
}
/**
* Calls and returns the value from the supplied function.
*
* If a network error is thrown during its execution, then this function will invoke the
* supplied function a second time. This pattern is useful for tolerating network errors which
* result from elections triggered by any of the stepUpNewPrimaryOnShard(),
* killAndRestartPrimaryOnShard(), and shutdownAndRestartPrimaryOnShard() methods.
*
* @param fn - the function to be called.
* @returns the return value from fn.
*/
retryOnceOnNetworkError(fn) {
try {
return fn();
} catch (e) {
if (!isNetworkError(e)) {
throw e;
}
return fn();
}
}
};