Files
mongo/jstests/sharding/config_shard_catalog_cache_refresh.js
David Chen 738852b2e4 SERVER-115970: exclude config_shard_catalog_cache_refresh.js from suites with stepdowns (#45623)
GitOrigin-RevId: 7391531d04dd6120d8d1ad7b88c9356f1525ef9c
2025-12-26 22:06:13 +00:00

127 lines
4.8 KiB
JavaScript

/**
* Tests that a config shard node performing an aggregation on 'config.*' to refresh the catalog
* cache, where this aggregation ends up targeting the requesting node itself, does not end up in a
* deadlock.
*
* @tags: [
* requires_sharding,
* does_not_support_stepdowns,
* ]
*/
import {configureFailPoint, kDefaultWaitForFailPointTimeout} from "jstests/libs/fail_point_util.js";
import {funWithArgs} from "jstests/libs/parallel_shell_helpers.js";
import {ShardingTest} from "jstests/libs/shardingtest.js";
/**
 * Biases 'readPreference: nearest' server selection on the given replica set towards its primary.
 *
 * Internal aggregations on the config server use 'readPreference: nearest', and the issue under
 * test only reproduces when server selection ends up targeting the primary. SDAM server selection
 * discards any candidate whose round trip time (RTT) is above defaultLocalThresholdMillis (15).
 * The RTT is computed from the 'hello' command on init, but updated with ping responses. Because
 * the shard keeps a single Replica Set Monitor, delaying 'hello' is unhelpful; instead, the RTT
 * obtained from ping is overridden for the secondaries via the 'serverPingMonitorSetRTT'
 * failpoint, which is enabled on the primary.
 *
 * @param replSet Replica set fixture; getPrimary() and getSecondaries() are called on it.
 * @returns A zero-argument function that turns the failpoint off again.
 */
function forceReadPreferenceNearestToTargetPrimary(replSet) {
    // The current RTT only accounts for kRttAlpha (0.2) of the new RTT, so the injected value must
    // be large enough to exceed defaultLocalThresholdMillis after a single update.
    const kDelayMicros = 200 * 1000;

    // Map every secondary's host string to the simulated (slow) ping RTT.
    const slowSecondaryRtts = Object.fromEntries(
        replSet.getSecondaries().map((secondary) => [secondary.host, kDelayMicros]),
    );

    const pingRttFailPoint = configureFailPoint(
        replSet.getPrimary(),
        "serverPingMonitorSetRTT",
        slowSecondaryRtts,
    );

    // Wait until a ping has been processed for 3 nodes (even though only the secondaries have
    // their RTT overridden), i.e. a refresh happened for each node.
    pingRttFailPoint.wait({timesEntered: 3});

    return function turnOffPingRttOverride() {
        pingRttFailPoint.off();
    };
}
// Log component verbosity applied to every replica set node through setParameter below.
const logLevel = tojson({sharding: {shardingCatalogRefresh: 1}, command: 2, query: 1});

// Per-node options for the three-node replica set: one default node (the primary) and two
// secondaries configured with priority 0.
const kPrimaryNodeOptions = {};
const makeSecondaryNodeOptions = () => ({rsConfig: {priority: 0}});

// Single-shard cluster whose only shard is the config shard; the balancer is disabled.
let st = new ShardingTest({
    shards: 1,
    rs: {
        nodes: [kPrimaryNodeOptions, makeSecondaryNodeOptions(), makeSecondaryNodeOptions()],
        setParameter: {logComponentVerbosity: logLevel, heartBeatFrequencyMs: 1000},
    },
    configShard: true,
    other: {enableBalancer: false},
});
// Database that holds every collection used by this test.
const dbName = "test";

/**
 * Builds the fully qualified namespace ("<dbName>.<coll>") of a collection in the test database.
 *
 * @param coll Collection name.
 * @returns The namespace string.
 */
function nsFor(coll) {
    return `${dbName}.${coll}`;
}
// Run at least as many parallel queries as there are threads in the CatalogCache thread pool.
const nParallelQueries = 6;
let collections = [...Array(nParallelQueries).keys()].map((idx) => "coll_" + idx);

// Shard each test collection on {_id: 1} and insert a single document into it.
for (const coll of collections) {
    assert.commandWorked(st.s.adminCommand({shardCollection: nsFor(coll), key: {_id: 1}}));
    st.s.getDB(dbName)[coll].insertOne({x: 1});
}

// createCollectionCoordinator issues a fire-and-forget routing table cache update, which can
// complete after 'flushRouterConfig' and thereby disturb the test setup. Running a find against
// every collection ensures that refresh has finished before continuing.
for (const coll of collections) {
    assert.commandWorked(st.s.getDB(dbName).runCommand({find: coll}));
}
const configShard = st.shard0;

// Clear the catalog cache on the shard so that the queries below have to refresh it.
assert.commandWorked(configShard.adminCommand({flushRouterConfig: 1}));

// Make 'nearest' server selection pick the primary; the returned function undoes this.
const cleanUpFn = forceReadPreferenceNearestToTargetPrimary(st.rs0);

// Block CollectionCache lookups on dbName.
const cacheFP = configureFailPoint(configShard, "blockCollectionCacheLookup", {nss: dbName});

// Body executed by each parallel shell. It receives the database and collection names as
// arguments (bound through funWithArgs) instead of reading them from this script's scope.
const runUnionWithOnPrimary = function (dbName, collName) {
    // We want the query to happen on the primary.
    db.getMongo().setReadPref("primary");
    const testDB = db.getSiblingDB(dbName);
    // Use a $unionWith to cause the shard to perform some routing internally. Otherwise
    // there is no need for a refresh on the shard.
    assert.commandWorked(
        testDB.runCommand({aggregate: collName, pipeline: [{$unionWith: {coll: collName}}], cursor: {}}),
    );
    jsTestLog("Finished query for " + collName);
};

// Issue one parallel query per test collection through the router, keeping each handle returned
// by startParallelShell so the shells can be joined later.
let parallelShellArr = [];
for (const coll of collections) {
    parallelShellArr.push(startParallelShell(funWithArgs(runUnionWithOnPrimary, dbName, coll), st.s.port));
}
// Wait for all queries to reach the blocked catalog cache lookup. Each thread hits the failpoint
// twice: once to test the condition, and then once to wait on it.
//
// The failpoint handle's wait() is called with a single options object, exactly like the
// `wait({timesEntered: 3})` call on the ping monitor failpoint in this file. Passing the timeout
// and the hit count as two positional arguments would leave the options at their defaults, so the
// wait could return before every query is actually blocked.
cacheFP.wait({
    maxTimeMS: kDefaultWaitForFailPointTimeout,
    timesEntered: collections.length * 2,
});

// Unblock threads.
cacheFP.off();

jsTestLog("Join parallel queries");
// Each entry is the handle returned by startParallelShell; invoking it joins that shell.
parallelShellArr.forEach((parallelShell) => parallelShell());

// Turn off the ping RTT override before shutting the cluster down.
cleanUpFn();
st.stop();