// Files
// mongo/jstests/sharding/kill_pinned_cursor.js
//
// 271 lines
// 12 KiB
// JavaScript

/**
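* Runs a query on a sharded cluster where the getMore hangs on the shards (via a failpoint) and
* verifies that the pinned cursor can always be killed: with killCursors or killSessions on the
* mongos, or with killOp or killCursors on one of the shards.
*
* Uses getMore to pin an open cursor.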
* @tags: [
* requires_getmore,
* ]
*/
(function() {
"use strict";
// Provides waitForCurOpByFailPoint().
load("jstests/libs/curop_helpers.js");

// Sessions are started by hand in this test, which cannot be combined with implicit sessions.
TestData.disableImplicitSessions = true;

// Failpoint (and its options) enabled on the shards to make their getMore requests hang.
const kFailPointName = "waitAfterPinningCursorBeforeGetMoreBatch";
const kFailpointOptions = {shouldCheckForInterrupt: true};

// Comment attached to the find command; used later to recognise this test's operations in
// $currentOp output.
const kCommandCommentString = "kill_pinned_cursor_js_test";

const kDBName = "test";

// Two-shard cluster, plus handles to the test database through mongos and through each shard.
const st = new ShardingTest({shards: 2});
const mongosDB = st.s.getDB(kDBName);
const shard0DB = st.shard0.getDB(kDBName);
const shard1DB = st.shard1.getDB(kDBName);

// Start from an empty collection holding the documents {_id: 0} through {_id: 9}.
const coll = mongosDB.jstest_kill_pinned_cursor;
coll.drop();
for (let docId = 0; docId < 10; ++docId) {
    assert.commandWorked(coll.insert({_id: docId}));
}

// Distribute the collection so that a find through mongos opens a cursor on each shard (see
// ShardingTest.shardColl for the exact meaning of the split/move arguments).
st.shardColl(coll, {_id: 1}, {_id: 5}, {_id: 6}, kDBName, false);
st.ensurePrimaryShard(kDBName, st.shard0.name);
// startParallelShell() serializes the code it is given into a string, so we cannot hand it a
// function that captures local variables. Instead we use the trick below: the values we would
// like to capture are spliced into the generated source as literals (or stashed on TestData and
// read back by the generated code). Kudos to Dave Storch for coming up with this idea.
/**
 * Builds the source code for a parallel shell that runs a getMore on 'cursorId' and expects it
 * to fail with one of 'getMoreErrCodes'.
 *
 * Side effects: stores 'getMoreErrCodes' (and, when 'useSession' is set, 'sessionId') on
 * TestData, from where the generated code reads them.
 *
 * @param cursorId - id of the cursor to getMore; embedded through its toString() form.
 * @param getMoreErrCodes - error code(s) handed to assert.commandFailedWithCode().
 * @param useSession - when truthy, the getMore carries an lsid and the session is ended after
 *     the getMore has failed as expected.
 * @param sessionId - the lsid to use; only consulted when 'useSession' is truthy.
 * @returns {string} JavaScript source suitable for startParallelShell().
 */
function makeParallelShellFunctionString(cursorId, getMoreErrCodes, useSession, sessionId) {
    let code = `const cursorId = ${cursorId.toString()};`;
    code += `const kDBName = "${kDBName}";`;
    code += `const collName = "${coll.getName()}";`;
    code += `const useSession = ${useSession};`;

    TestData.getMoreErrCodes = getMoreErrCodes;
    if (useSession) {
        TestData.sessionId = sessionId;
    }

    // This function is never invoked in this shell; only its source text is used. Inside the
    // parallel shell its free variables resolve to the constants emitted above.
    const runGetMore = function() {
        // Target the test database explicitly rather than relying on the shell's default 'db'.
        const testDB = db.getSiblingDB(kDBName);
        const getMoreCmd = {getMore: cursorId, collection: collName, batchSize: 4};
        if (useSession) {
            getMoreCmd.lsid = TestData.sessionId;
        }
        // We expect that the operation will get interrupted and fail.
        assert.commandFailedWithCode(testDB.runCommand(getMoreCmd), TestData.getMoreErrCodes);
        if (useSession) {
            assert.commandWorked(testDB.adminCommand({endSessions: [TestData.sessionId]}));
        }
    };

    code += `(${runGetMore.toString()})();`;
    return code;
}
// Checks that every cursor taking part in a sharded query can be killed, even while pinned.
//
// The scenario: enable a failpoint on both shards so that their getMores hang, open a cursor
// through mongos, and issue a getMore on it from a parallel shell (which therefore hangs too).
// 'killFunc' is then invoked with the mongos cursor id, after which the mongos cursor and the
// cursors on the shards are all expected to be cleaned up.
//
// 'getMoreErrCodes' lists the error code(s) the hanging getMore is expected to fail with (for
// example "CursorKilled" after a killCursors command, but "Interrupted" after a killOp).
// 'useSession' selects whether the find and getMore run inside a manually started session.
function testShardedKillPinned({killFunc, getMoreErrCodes, useSession}) {
    const shardDBs = [shard0DB, shard1DB];
    const pinnedOpFilter = {"command.comment": kCommandCommentString};

    // $currentOp (including idle cursors) on the given shard, optionally followed by more stages.
    const currentOpOnShard = (shardDB, ...extraStages) =>
        shardDB.getSiblingDB("admin").aggregate(
            [{$currentOp: {idleCursors: true}}, ...extraStages]);

    // Non-null only while the parallel shell still has to be joined.
    let joinGetMoreShell = null;

    try {
        // Make getMore requests hang on the mongods. The failpoint is deliberately NOT enabled
        // on the mongos: there it would only make the mongos spin without sending any request
        // to the shards.
        for (const shardDB of shardDBs) {
            assert.commandWorked(shardDB.adminCommand({
                configureFailPoint: kFailPointName,
                mode: "alwaysOn",
                data: kFailpointOptions,
            }));
        }

        // A find through mongos should open a cursor on each of the shards.
        const findCmd = {find: coll.getName(), batchSize: 2, comment: kCommandCommentString};
        let lsid;
        if (useSession) {
            // The session is started by hand so the parallel shell can continue it.
            lsid = assert.commandWorked(mongosDB.adminCommand({startSession: 1})).id;
            findCmd.lsid = lsid;
        }
        const findRes = mongosDB.runCommand(findCmd);
        assert.commandWorked(findRes);
        const mongosCursorId = findRes.cursor.id;
        assert.neq(mongosCursorId, NumberLong(0));

        // Issue the getMore from a parallel shell; it is expected to block on the failpoint.
        joinGetMoreShell = startParallelShell(
            makeParallelShellFunctionString(mongosCursorId, getMoreErrCodes, useSession, lsid),
            st.s.port);

        // Do not proceed until the cursors on both mongods are known to be pinned.
        for (const shardDB of shardDBs) {
            waitForCurOpByFailPoint(shardDB, coll.getFullName(), kFailPointName, pinnedOpFilter);
        }

        // Let the caller kill the sharded query in whichever way it is testing.
        killFunc(mongosCursorId);

        // With the cursor killed, the getMore must return even though the failpoint is still
        // switched on.
        joinGetMoreShell();
        joinGetMoreShell = null;

        // The getMore against mongos has reported that the cursor was killed. Confirm the
        // cursor is really gone: a killCursors for it must report it as "not found".
        const secondKillRes =
            mongosDB.runCommand({killCursors: coll.getName(), cursors: [mongosCursorId]});
        assert.commandWorked(secondKillRes);
        assert.eq(secondKillRes.cursorsAlive, []);
        assert.eq(secondKillRes.cursorsNotFound, [mongosCursorId]);
        assert.eq(secondKillRes.cursorsUnknown, []);

        // The mongos kills the shard cursors when its own cursor is killed, so they should
        // eventually disappear from each mongod as well.
        for (const shardDB of shardDBs) {
            assert.soon(
                () => currentOpOnShard(shardDB, {$match: pinnedOpFilter}).itcount() === 0,
                () => "assert.soon failed: " + tojson(currentOpOnShard(shardDB).toArray()));
        }
    } finally {
        // Always release the failpoints, then join the parallel shell if it is still running.
        for (const shardDB of shardDBs) {
            assert.commandWorked(
                shardDB.adminCommand({configureFailPoint: kFailPointName, mode: "off"}));
        }
        if (joinGetMoreShell) {
            joinGetMoreShell();
        }
    }
}
// Every scenario below is run once inside a session and once without one.
for (const useSession of [true, false]) {
    // Returns the single in-progress getMore on shard0 that carries this test's comment.
    const findPinnedGetMoreOnShard0 = () => {
        const matchingOps = shard0DB.getSiblingDB("admin")
                                .aggregate([
                                    {$currentOp: {}},
                                    {
                                        $match: {
                                            "command.getMore": {$exists: true},
                                            "command.comment": kCommandCommentString,
                                        },
                                    },
                                ])
                                .toArray();
        assert.eq(1, matchingOps.length, matchingOps);
        return matchingOps[0];
    };

    // Runs killCursors for 'targetCursorId' through 'targetDB' and checks that exactly this
    // cursor is reported as killed (i.e. the command neither hangs nor reports the cursor as
    // still alive, not found or unknown).
    const killCursorAndVerify = (targetDB, targetCursorId) => {
        const killRes =
            targetDB.runCommand({killCursors: coll.getName(), cursors: [targetCursorId]});
        assert.commandWorked(killRes);
        assert.eq(killRes.cursorsKilled, [targetCursorId]);
        assert.eq(killRes.cursorsAlive, []);
        assert.eq(killRes.cursorsNotFound, []);
        assert.eq(killRes.cursorsUnknown, []);
    };

    // Scenario 1: killCursors against the pinned mongos cursor (whose mongod cursors are pinned
    // too) must succeed, without hanging or returning "CursorInUse", and must clean up all of
    // the involved cursors.
    testShardedKillPinned({
        killFunc: (mongosCursorId) => killCursorAndVerify(mongosDB, mongosCursorId),
        getMoreErrCodes: ErrorCodes.CursorKilled,
        useSession,
    });

    // Scenario 2: killOp against the getMore pinned on one mongod must cause all of the involved
    // cursors to be killed.
    testShardedKillPinned({
        // The mongos cursor id is not needed here: the op id to kill comes from currentOp.
        killFunc: () => {
            const pinnedGetMore = findPinnedGetMoreOnShard0();
            assert.commandWorked(shard0DB.killOp(pinnedGetMore.opid));
        },
        getMoreErrCodes: ErrorCodes.Interrupted,
        useSession,
    });

    // Scenario 3: killCursors against the cursor pinned on one mongod must cause all of the
    // involved cursors to be killed.
    testShardedKillPinned({
        // The mongos cursor id is not needed here: the shard cursor id comes from currentOp.
        killFunc: () => {
            const pinnedGetMore = findPinnedGetMoreOnShard0();
            killCursorAndVerify(shard0DB, pinnedGetMore.command.getMore);
        },
        getMoreErrCodes: ErrorCodes.CursorKilled,
        useSession,
    });
}
// Final scenario: killSessions on the session that is running the getMore must kill the cursor.
testShardedKillPinned({
    // The mongos cursor id is not needed here: the session to kill is looked up with
    // $listLocalSessions instead.
    killFunc: () => {
        // Earlier runs may have left sessions behind on the server, so order the sessions by
        // 'lastUse' (newest first) and only kill the most recently used one.
        const sessionPipeline = [
            {$listLocalSessions: {allUsers: true}},
            {$sort: {lastUse: -1}},
        ];
        const sessionsNewestFirst = mongosDB.aggregate(sessionPipeline).toArray();
        const newestSessionUUID = sessionsNewestFirst[0]._id.id;
        assert.commandWorked(mongosDB.runCommand({killSessions: [{id: newestSessionUUID}]}));
    },
    // Killing a session on mongos first kills all matching remote cursors (through KillCursors)
    // and then all matching local operations (through KillOp). Depending on which response
    // arrives first, the getMore fails with either CursorKilled or Interrupted.
    getMoreErrCodes: [ErrorCodes.CursorKilled, ErrorCodes.Interrupted],
    useSession: true,
});

// Shut the cluster down now that all scenarios have run.
st.stop();
})();