Files
mongo/jstests/serial_run/allow_partial_results_with_maxTimeMS_failpoints.js
Steve McClure 1ffbc6c2e9 SERVER-109432: Autofix JS var usage to favor let (#40637)
GitOrigin-RevId: 9674b7db36a0f3f650d39c1e3fb2ad6ff2141cfb
2025-08-28 19:21:01 +00:00

415 lines
15 KiB
JavaScript

/**
* SERVER-57469: Test that the 'allowPartialResults' option to find is respected when used together
* with 'maxTimeMS' and only a subset of the shards provide data before the timeout.
* Uses three methods to simulate MaxTimeMSExpired: failpoints, MongoBridge, and $where + sleep.
*
* This is in the serial_run suite because the tests using MongoBridge and $where + sleep depend on
* certain work being completed before a maxTimeMS timeout.
*
* @tags: [
* requires_sharding,
* requires_replication,
* requires_getmore,
* requires_fcv_62,
* requires_scripting
* ]
*/
import {configureFailPoint} from "jstests/libs/fail_point_util.js";
import {ShardingTest} from "jstests/libs/shardingtest.js";
// Seed the shell's Random helper. No explicit seed value is passed here.
Random.setRandomSeed();
/**
 * Returns the current wall-clock time in milliseconds since the Unix epoch.
 */
function getMillis() {
    return new Date().getTime();
}
/**
 * Calls 'f' once with no arguments and returns how many milliseconds the call took, as measured
 * by getMillis(). Whatever 'f' returns is discarded.
 */
function runtimeMillis(f) {
    const startedAt = getMillis();
    f();
    const finishedAt = getMillis();
    return finishedAt - startedAt;
}
/**
 * Returns true when a command response represents a failure: either it has no own "ok" field or
 * that field is falsy.
 */
function isError(res) {
    const hasOkField = res.hasOwnProperty("ok");
    return !(hasOkField && res["ok"]);
}
/**
 * Fans enable()/disable() out to a list of controllers so that several of them can be driven as
 * a single unit.
 */
class MultiController {
    /**
     * @param controllerList controllers that each expose enable() and disable().
     */
    constructor(controllerList) {
        this.controllerList = controllerList;
    }

    /** Calls enable() on every controller, in list order. */
    enable() {
        this.controllerList.forEach((controller) => controller.enable());
    }

    /** Calls disable() on every controller, in list order. */
    disable() {
        this.controllerList.forEach((controller) => controller.disable());
    }
}
/**
 * Turns the "maxTimeNeverTimeOut" failpoint on and off for a single node. The tests in this file
 * enable it so that maxTimeMS is ignored while an initial find runs.
 */
class NeverTimeoutController {
    /**
     * @param mongoInstance connection to the node on which the failpoint is configured.
     */
    constructor(mongoInstance) {
        this.mongoInstance = mongoInstance;
    }

    /** Configures the failpoint in "alwaysOn" mode and remembers the returned handle. */
    enable() {
        const failPointName = "maxTimeNeverTimeOut";
        const failPointData = {};
        this.fp = configureFailPoint(this.mongoInstance, failPointName, failPointData, "alwaysOn");
    }

    /** Switches off the failpoint handle stored by enable(). */
    disable() {
        const {fp} = this;
        fp.off();
    }
}
// Names of the database and collection used throughout this test.
const dbName = "test-SERVER-57469-failpoints";
const collName = "test-SERVER-57469-failpoints-coll";
// Start a cluster of two shards, each a single-node replica set, with MongoBridge enabled.
const st = new ShardingTest({
    name: jsTestName(),
    shards: 2,
    useBridge: true,
    rs: {nodes: 1},
});
// Handle to the test collection, obtained through st.s0.
const coll = st.s0.getDB(dbName)[collName];
/**
 * Drops the test collection, shards it by range on {_id: 1}, and bulk-inserts documents whose
 * _id values run from 0 to numSamples - 1.
 *
 * @param numSamples number of documents to insert.
 * @param splitPoint _id value at which the collection is split; the chunk containing
 *     splitPoint + 1 is moved to the other shard.
 */
function initDb(numSamples, splitPoint) {
    coll.drop();

    // Ranged sharding, with a chosen fraction of the data placed on the second shard.
    const shardKey = {_id: 1};
    const splitAt = {_id: splitPoint};
    const moveChunkContaining = {_id: splitPoint + 1};
    st.shardColl(coll, shardKey, splitAt, moveChunkContaining);

    const bulk = coll.initializeUnorderedBulkOp();
    let nextId = 0;
    while (nextId < numSamples) {
        bulk.insert({"_id": nextId});
        nextId++;
    }
    assert.commandWorked(bulk.execute());
}
// Populate the collection: 200 documents, split at the midpoint (and never below 1).
const nDocs = 200;
const splitPoint = Math.max(nDocs / 2, 1);
initDb(nDocs, splitPoint);
// Some scenarios delay document processing on particular shards by means of $where expressions.
// This object maps a shard to its snippet of JS code; FindWhereSleepController adds and removes
// the entries.
const whereExpressions = {};

/**
 * Builds the $where code used by the find commands in this file: every registered snippet,
 * concatenated, followed by "return 1;".
 */
function whereCode() {
    const snippets = Object.values(whereExpressions);
    return `${snippets.join("")}return 1;`;
}
/**
 * Runs a find over the test collection that asks for all nDocs documents in one batch.
 *
 * @param doAllowPartialResults value sent as the command's 'allowPartialResults' option.
 * @param timeout value sent as the command's 'maxTimeMS'.
 * @returns the raw command response.
 */
function runQueryWithTimeout(doAllowPartialResults, timeout) {
    const findCmd = {
        find: collName,
        filter: {$where: whereCode()},
        allowPartialResults: doAllowPartialResults,
        batchSize: nDocs,
        maxTimeMS: timeout,
    };
    return coll.runCommand(findCmd);
}
// One controller that toggles the "never time out" failpoint on both shards and on mongos.
const neverTimeout = new MultiController(
    [st.shard0, st.shard1, st.s].map((node) => new NeverTimeoutController(node)),
);
// ampleTimeMS is at least 2000ms, plus ten times the measured runtime of the basic query.
// It has to leave ample time for our queries to run to completion, even on passthrough suites
// with resource contention.
const baselineQueryMillis = runtimeMillis(() => runQueryWithTimeout(true, 999999999));
const ampleTimeMS = 2000 + 10 * baselineQueryMillis;
print(`ampleTimeMS: ${ampleTimeMS}`);
/**
 * Tries to fetch all the data in a single batch, using ampleTimeMS as the maxTimeMS.
 *
 * @param doAllowPartialResults value sent as the command's 'allowPartialResults' option.
 * @returns the raw command response from runQueryWithTimeout().
 */
function runQuery(doAllowPartialResults) {
    const timeoutMS = ampleTimeMS;
    return runQueryWithTimeout(doAllowPartialResults, timeoutMS);
}
// Simulate mongos timeout during first batch.
// Shards have no results yet, so we do not return partial results.
{
    const fpMongos = configureFailPoint(st.s, "maxTimeAlwaysTimeOut", {}, "alwaysOn");
    // Whether 'allowPartialResults' is false or true, a mongos timeout yields a timeout error.
    for (const doAllowPartialResults of [false, true]) {
        assert.commandFailedWithCode(runQuery(doAllowPartialResults), ErrorCodes.MaxTimeMSExpired);
    }
    fpMongos.off();
}
/**
 * Simulates a mongos timeout during getMore.
 *
 * Fetches a first batch with timeouts suppressed, then forces maxTimeMS expiry on mongos and
 * issues getMores until the command either fails with MaxTimeMSExpired or reports partial
 * results.
 *
 * @param allowPartialResults value of the find's 'allowPartialResults' option.
 * @param batchSize batch size used for the find and for every getMore.
 */
function getMoreMongosTimeout(allowPartialResults, batchSize) {
    // First batch: MaxTimeMSExpired is disabled for the duration of the initial find.
    neverTimeout.enable();
    const initialFind = {
        find: collName,
        filter: {$where: whereCode()},
        allowPartialResults: allowPartialResults,
        batchSize: batchSize,
        maxTimeMS: ampleTimeMS,
    };
    const res = assert.commandWorked(coll.runCommand(initialFind));
    neverTimeout.disable();
    assert(!res.cursor.hasOwnProperty("partialResultsReturned"));
    assert.gt(res.cursor.id, 0);

    // From here on mongos always times out.
    const fpMongos = configureFailPoint(st.s, "maxTimeAlwaysTimeOut", {}, "alwaysOn");

    // Keep issuing getMores until the cache on mongos is exhausted. Eventually we should see
    // either a MaxTimeMS error or partial results.
    let numReturned = batchSize; // One batch was returned so far.
    for (;;) {
        const res2 = coll.runCommand({getMore: res.cursor.id, collection: collName, batchSize: batchSize});
        if (isError(res2)) {
            assert.commandFailedWithCode(
                res2,
                ErrorCodes.MaxTimeMSExpired,
                "failure should be due to MaxTimeMSExpired",
            );
            break;
        }
        // Results were cached from the first request. As long as GetMore is not called, these
        // are returned even if MaxTimeMS expired on mongos.
        numReturned += res2.cursor.nextBatch.length;
        print(`${numReturned} docs returned so far`);
        assert.neq(numReturned, nDocs, "Got full results even through mongos had MaxTimeMSExpired.");
        if (!res2.cursor.partialResultsReturned) {
            continue;
        }
        // Partial results may only be reported when the caller asked for them.
        assert(allowPartialResults);
        assert.lt(numReturned, nDocs);
        break;
    }
    fpMongos.off();
}
// The getMore tests run with two batch sizes.
// First case: (splitPoint % batchSizeForGetMore == 0). The getMores will likely exhaust the live
// shard without requiring any data from the dead shard, so when the getMore to the dead shard
// times out it will be the only unexhausted remote.
// Second case: (splitPoint % batchSizeForGetMore != 0). The final getMore will be requesting
// data from both shards, and data from the live shard should be returned despite the dead shard
// timing out.
const batchSizesForGetMore = [50, 47];
{
    // Check that the two sizes really have the properties described above.
    const [evenlyDividing, unevenlyDividing] = batchSizesForGetMore;
    assert.eq(splitPoint % evenlyDividing, 0);
    assert.neq(splitPoint % unevenlyDividing, 0);
    assert.lt(evenlyDividing, splitPoint);
    assert.lt(unevenlyDividing, splitPoint);
}
/**
 * Calls 'callback' with the first and then the second entry of batchSizesForGetMore.
 */
function withEachBatchSize(callback) {
    const [firstSize, secondSize] = batchSizesForGetMore;
    for (const size of [firstSize, secondSize]) {
        callback(size);
    }
}
/**
 * Calls 'callback' first with true and then with false.
 */
function withEachValueOfAllowPartialResults(callback) {
    for (const value of [true, false]) {
        callback(value);
    }
}
// Exercise the mongos-timeout getMore scenario for every combination of allowPartialResults
// value and batch size.
withEachValueOfAllowPartialResults((allowPartialResults) => {
    withEachBatchSize((batchSize) => {
        getMoreMongosTimeout(allowPartialResults, batchSize);
    });
});
// Test shard timeouts. These are the scenarios that we expect to be likely in practice.
// Test using 3 different types of simulated timeouts, giving slightly different execution paths.
/**
 * Simulates a timeout on one node by toggling its "maxTimeAlwaysTimeOut" failpoint.
 */
class MaxTimeMSFailpointFailureController {
    /**
     * @param mongoInstance connection to the node on which the failpoint is configured.
     */
    constructor(mongoInstance) {
        this.mongoInstance = mongoInstance;
        // Handle returned by configureFailPoint(); null until enable() is called.
        this.fp = null;
    }

    /** Configures the failpoint in "alwaysOn" mode and stores the returned handle. */
    enable() {
        const target = this.mongoInstance;
        this.fp = configureFailPoint(target, "maxTimeAlwaysTimeOut", {}, "alwaysOn");
    }

    /** Switches off the failpoint handle stored by enable(). */
    disable() {
        const activeFailPoint = this.fp;
        activeFailPoint.off();
    }
}
/**
 * Simulates a timeout by delaying, on the shard's primary, the messages that come from mongos.
 */
class NetworkFailureController {
    /**
     * @param shard object whose getPrimary() returns the node on which the delay is applied.
     */
    constructor(shard) {
        this.shard = shard;
        // The delay is 10% longer than ampleTimeMS, rounded to a whole number of milliseconds.
        this.delayTime = Math.round(1.1 * ampleTimeMS);
    }

    enable() {
        // Messages from mongos to the shard are delayed so that mongos sees the shard as having
        // exceeded MaxTimeMS. The shard process is active and receives the request, but the
        // response is lost. Delaying is preferred over dropping messages because it lets the
        // shard request proceed without connection failure and retry (which has its own
        // driver-controlled timeout, typically 15s).
        const primary = this.shard.getPrimary();
        primary.delayMessagesFrom(st.s, this.delayTime);
    }

    disable() {
        const primary = this.shard.getPrimary();
        primary.delayMessagesFrom(st.s, 0);
        // Give already-delayed messages time to flush so that the next request is not delayed.
        sleep(this.delayTime);
    }
}
/**
 * Simulates a timeout by registering a $where snippet that sleeps while one particular document
 * on the shard of interest is being processed.
 */
class FindWhereSleepController {
    /**
     * @param shard the shard whose document processing should be slowed down; also used as the
     *     key into whereExpressions.
     */
    constructor(shard) {
        this.shard = shard;
    }

    enable() {
        // Pick the document that triggers the sleep. The base id is 0 when this controller was
        // built for st.shard0 and splitPoint otherwise. The comparison is an identity check on
        // the shard object, so strict equality is used.
        const baseDocId = this.shard === st.shard0 ? 0 : splitPoint;
        // Offset the slow document by at least the getMore batch size so that when testing
        // getMore, we quickly return enough documents to serve the first batch without timing
        // out.
        const slowDocId = baseDocId + Math.max(...batchSizesForGetMore);
        const sleepTimeMS = Math.round(1.5 * ampleTimeMS);
        whereExpressions[this.shard] = `if (this._id == ${slowDocId}) {sleep(${sleepTimeMS})};`;
    }

    /** Removes this shard's snippet so that later $where code no longer sleeps for it. */
    disable() {
        delete whereExpressions[this.shard];
    }
}
// One controller per shard for each failure mechanism, plus a MultiController per mechanism
// that drives both shards together.
const [shard0Failpoint, shard1Failpoint] = [st.shard0, st.shard1].map(
    (conn) => new MaxTimeMSFailpointFailureController(conn),
);
const allShardsFailpoint = new MultiController([shard0Failpoint, shard1Failpoint]);
const [shard0NetworkFailure, shard1NetworkFailure] = [st.rs0, st.rs1].map(
    (replSet) => new NetworkFailureController(replSet),
);
const allshardsNetworkFailure = new MultiController([shard0NetworkFailure, shard1NetworkFailure]);
const [shard0SleepFailure, shard1SleepFailure] = [st.shard0, st.shard1].map(
    (conn) => new FindWhereSleepController(conn),
);
const allShardsSleepFailure = new MultiController([shard0SleepFailure, shard1SleepFailure]);
// Due to the hack with sleepFailures below, this has to be the innermost parameterizing function.
/**
 * Calls 'callback' once for each single-shard failure controller: the two failpoint controllers,
 * then the two network-delay controllers, then the two $where-sleep controllers.
 *
 * The sleep controllers are enabled before their callback and disabled right after it, so no
 * $where sleep snippet stays registered once this function returns.
 */
function withEachSingleShardFailure(callback) {
    callback(shard0Failpoint);
    callback(shard1Failpoint);
    callback(shard0NetworkFailure);
    callback(shard1NetworkFailure);
    // A FindWhereSleepController must be enabled before the first "find" because that's when
    // the $where clause is set. Each one is disabled again as soon as its callback is done;
    // otherwise its sleep would stay in the $where code of every later query.
    for (const sleepFailure of [shard0SleepFailure, shard1SleepFailure]) {
        sleepFailure.enable();
        callback(sleepFailure);
        sleepFailure.disable();
    }
}
/**
 * Calls 'callback' once for each all-shards failure controller: failpoint, network delay, and
 * $where sleep, in that order.
 */
function withEachAllShardFailure(callback) {
    const allShardFailures = [allShardsFailpoint, allshardsNetworkFailure, allShardsSleepFailure];
    for (const failure of allShardFailures) {
        callback(failure);
    }
}
/**
 * Simulates a shard failing during getMore.
 *
 * Fetches a first batch with timeouts suppressed, enables the given failure controller, and
 * then issues getMores until the command reports partial results (allowPartialResults true) or
 * fails with MaxTimeMSExpired (allowPartialResults false).
 *
 * @param allowPartialResults value of the find's 'allowPartialResults' option.
 * @param failureController controller with enable()/disable() that makes a shard fail.
 * @param batchSize batch size used for the find and for every getMore.
 */
function getMoreShardTimeout(allowPartialResults, failureController, batchSize) {
    // First batch. The query gets ample time and no failure is enabled yet, so a timeout is not
    // expected here. To be safe, failpoints make the initial find ignore MaxTimeMS altogether,
    // which avoids failures under resource contention (BF-26792).
    neverTimeout.enable();
    const initialFind = {
        find: collName,
        filter: {$where: whereCode()},
        // FindWhereSleepController only works with getMore if docs are _id-ordered. A hint is
        // used instead of a sort because we want to avoid blocking on missing docs -- the
        // AsyncResultsMerger should return results from the live shard while the other one is
        // failing.
        hint: {_id: 1},
        allowPartialResults: allowPartialResults,
        batchSize: batchSize,
        maxTimeMS: ampleTimeMS,
    };
    const res = assert.commandWorked(coll.runCommand(initialFind));
    neverTimeout.disable();
    assert.eq(undefined, res.cursor.partialResultsReturned);
    assert.gt(res.cursor.id, 0);

    // Make a shard fail, then keep running getMores.
    failureController.enable();
    let numReturned = batchSize; // One batch was returned so far.
    print(`${numReturned} docs returned in the first batch`);
    for (;;) {
        // getMores are repeated until the cache on mongos is exhausted. Eventually we should
        // get partial results or an error because a shard is down.
        const res2 = coll.runCommand({getMore: res.cursor.id, collection: collName, batchSize: batchSize});
        if (allowPartialResults) {
            assert.commandWorked(res2);
        } else if (isError(res2)) {
            assert.commandFailedWithCode(
                res2,
                ErrorCodes.MaxTimeMSExpired,
                "failure should be due to MaxTimeMSExpired",
            );
            break;
        }
        numReturned += res2.cursor.nextBatch.length;
        print(`${numReturned} docs returned so far`);
        assert.neq(numReturned, nDocs, "Entire collection seemed to be cached by the first find!");
        if (!res2.cursor.partialResultsReturned) {
            continue;
        }
        // Partial results were reported, which is only legitimate when they were requested.
        assert(allowPartialResults, "Partial results should not have been allowed.");
        assert.lt(numReturned, nDocs);
        break;
    }
    failureController.disable();
}
// Run the shard-timeout getMore scenario for every combination of allowPartialResults value,
// batch size, and single-shard failure. No failure controller is enabled up front:
// getMoreShardTimeout() enables the controller it is handed, and withEachSingleShardFailure()
// enables the $where sleep controllers itself right before the callbacks that need them.
// Enabling a sleep controller here would put a shard0 sleep into the $where code of the first
// failpoint and network scenarios as well.
withEachValueOfAllowPartialResults((allowPartialResults) =>
    withEachBatchSize((batchSize) =>
        withEachSingleShardFailure((failure) => getMoreShardTimeout(allowPartialResults, failure, batchSize)),
    ),
);
/**
 * With 'allowPartialResults: true', a shard that times out on the first batch leads to partial
 * results: the response is flagged as partial, holds nDocs / 2 documents, and has cursor id 0.
 *
 * @param failureController controller with enable()/disable() that makes one shard fail.
 */
function partialResultsTrueFirstBatch(failureController) {
    failureController.enable();
    const {cursor} = assert.commandWorked(runQuery(true));
    assert(cursor.partialResultsReturned);
    assert.eq(cursor.firstBatch.length, nDocs / 2);
    assert.eq(0, cursor.id);
    failureController.disable();
}
withEachSingleShardFailure(partialResultsTrueFirstBatch);
/**
 * With 'allowPartialResults: false', one shard timing out makes the query fail with
 * MaxTimeMSExpired.
 *
 * @param failureController controller with enable()/disable() that makes one shard fail.
 */
function partialResultsFalseOneFailure(failureController) {
    failureController.enable();
    const response = runQuery(false);
    assert.commandFailedWithCode(response, ErrorCodes.MaxTimeMSExpired);
    failureController.disable();
}
withEachSingleShardFailure(partialResultsFalseOneFailure);
/**
 * With 'allowPartialResults: false', both shards timing out makes the query fail with
 * MaxTimeMSExpired.
 *
 * @param failureController controller with enable()/disable() that makes both shards fail.
 */
function allowPartialResultsFalseAllFailed(failureController) {
    failureController.enable();
    const response = runQuery(false);
    assert.commandFailedWithCode(response, ErrorCodes.MaxTimeMSExpired);
    failureController.disable();
}
withEachAllShardFailure(allowPartialResultsFalseAllFailed);
/**
 * With 'allowPartialResults: true', both shards timing out yields empty "partial" results: the
 * command succeeds, is flagged as partial, has cursor id 0, and returns no documents.
 *
 * @param failureController controller with enable()/disable() that makes both shards fail.
 */
function allowPartialResultsTrueAllFailed(failureController) {
    failureController.enable();
    const {cursor} = assert.commandWorked(runQuery(true));
    assert(cursor.partialResultsReturned);
    assert.eq(0, cursor.id);
    assert.eq(cursor.firstBatch.length, 0);
    failureController.disable();
}
withEachAllShardFailure(allowPartialResultsTrueAllFailed);
// All scenarios have run; stop the ShardingTest fixture created at the top of the file.
st.stop();