Compare commits

...

9 Commits
r5.3.2 ... v5.3

Author SHA1 Message Date
David Bradford
08077e4d70 SERVER-67405: Fix is_patch lookup for build_variant_gen
(cherry picked from commit 1260ae9d64)
2022-07-12 13:47:15 +00:00
Tommaso Tocci
4f9e10bb30 SERVER-57519 Make ARS use causally consistent ShardRegistry::getShard() function
(cherry picked from commit 527dfe85a1)
2022-07-11 21:46:35 +00:00
Henrik Edin
894acad323 SERVER-64307 Remove BSONColumn benchmarks from running on Evergreen
(cherry picked from commit 3928c9d541)
2022-06-09 18:41:13 +00:00
Henrik Edin
a86bb832e1 SERVER-65100 Increase block size for BSONColumn to be BSONObjMaxUserSize
(cherry picked from commit 0c2e3ff947)
2022-06-08 19:52:08 +00:00
Max Hirschhorn
3276296336 SERVER-66618 Wait in test for resharding coordinator to persist abort.
Fixes an issue in the resharding_coordinator_recovers_abort_decision.js
test where the reshardingPauseCoordinatorBeforeBlockingWrites failpoint
is being released too early and allowing the resharding coordinator to
decide to commit the resharding operation instead.

(cherry picked from commit 18ec837622)
2022-06-07 20:28:11 +00:00
Vlad Rachev
de31372c6f SERVER-66955 Remove JSON.send usage in sys-perf and performance
(cherry picked from commit 421855380204e5a9dcde2b83cccef8e6f75e3a96)
(cherry picked from commit 49e8e9786b5a074e3c28793e1ab7c425453cd14f)
2022-06-06 17:12:52 +00:00
Andrew Shuvalov
814159344a SERVER-66843 Use defensive programming in DeadlineFuture destructor
(cherry picked from commit cae77d5066)
2022-06-03 17:21:59 +00:00
Tommaso Tocci
58d6f304e9 SERVER-66860 FSM tests should not reuse same database names
(cherry picked from commit d88a892d5b)
2022-06-01 08:52:30 +00:00
Matt Kneiser
64ae4a253e SERVER-66769 Add Windows support to spawnhost setup script
(cherry picked from commit ad379f1338)
2022-05-31 17:34:02 +00:00
17 changed files with 318 additions and 160 deletions

View File

@@ -59,7 +59,7 @@ class EvgExpansions(BaseModel):
build_id: str
build_variant: str
exec_timeout_secs: Optional[int] = None
is_patch: Optional[bool]
is_patch: Optional[str]
project: str
max_tests_per_suite: Optional[int] = 100
max_sub_suites: Optional[int] = 5
@@ -83,10 +83,21 @@ class EvgExpansions(BaseModel):
def get_max_sub_suites(self) -> int:
"""Get the max_sub_suites to use."""
if not self.is_patch:
if not self.determine_is_patch():
return self.mainline_max_sub_suites
return self.max_sub_suites
def determine_is_patch(self) -> bool:
"""
Determine if expansions indicate whether the script is being run in a patch build.
In a patch build, the `is_patch` expansion will be the string value of "true". In a
non-patch setting, it will not exist, or be an empty string.
:return: True if task is being run in a patch build.
"""
return self.is_patch is not None and self.is_patch.lower() == "true"
def build_suite_split_config(self, start_date: datetime,
end_date: datetime) -> SuiteSplitConfig:
"""
@@ -113,7 +124,7 @@ class EvgExpansions(BaseModel):
"""
return GenTaskOptions(
create_misc_suite=True,
is_patch=self.is_patch,
is_patch=self.determine_is_patch(),
generated_config_dir=GENERATED_CONFIG_DIR,
use_default_timeouts=False,
timeout_secs=self.timeout_secs,

View File

@@ -2,15 +2,69 @@
cd $HOME # workaround EVG-12829
# Communicate to users that logged in before the script started that nothing is ready.
wall "The setup_spawnhost_coredump script has just started setting up the debugging environment."
unameOut=$(uname -s)
case "${unameOut}" in
Linux*) machine=Linux;;
Darwin*) machine=Mac;;
CYGWIN*) machine=Cygwin;;
*) machine="UNKNOWN:${unameOut}"
esac
# Write this file that gets cat'ed on login to communicate to users logging in if this setup script is still running.
echo '+-----------------------------------------------------------------+' > ~/.setup_spawnhost_coredump_progress
echo '| The setup script is still setting up data files for inspection. |' >> ~/.setup_spawnhost_coredump_progress
echo '+-----------------------------------------------------------------+' >> ~/.setup_spawnhost_coredump_progress
if [[ "${machine}" = "Cygwin" ]]; then
out_dir="/cygdrive/c/setup_script_output.txt"
desktop_dir="/cygdrive/c/Users/Administrator/Desktop"
cat >> ~/.profile <<EOF
{
date
env
echo "----------------------"
echo -e "\n=> Setting _NT_SOURCE_PATH environment variable for debuggers to pick up source files."
src_dir_hash=$(readlink -f /cygdrive/z/data/mci/source-*)
full_src_dir="${src_dir_hash}/src"
echo "Source Path: [${full_src_dir}]"
set -x;
setx _NT_SOURCE_PATH "${full_src_dir}"
{ set +x; } 2>/dev/null
echo -e "\n=> Setting _NT_SYMBOL_PATH environment variable for debuggers to pick up the symbols."
sym_parent_dir=$(readlink -f /cygdrive/z/data/mci/artifacts-*dist_test_debug)
sym_dir=$(readlink -f ${sym_parent_dir}/debugsymbols-mongodb*zip)
sym_extracted_dir="${sym_parent_dir}/extracted_symbols"
full_sym_dir="${sym_extracted_dir}/dist-test/bin"
echo "Symbols Dir: [${full_sym_dir}]"
echo -e "\n=> Extracting Symbol files."
set -x;
mkdir ${sym_extracted_dir}
unzip -n ${sym_dir} -d ${sym_extracted_dir}
setx _NT_SYMBOL_PATH "${full_sym_dir};srv*;"
{ set +x; } 2>/dev/null
echo -e "\n=> Extracting Core Dump to Desktop."
full_dump_dir=$(readlink -f /cygdrive/z/data/mci/artifacts-* | grep -v dist_test)
full_dump_parent_dir=$(readlink -f ${full_dump_dir}/mongo-coredumps*tgz)
extracted_dump_dir="${full_dump_dir}/extracted_dump"
set -x;
mkdir ${extracted_dump_dir}
tar -xzvf ${full_dump_parent_dir} -C ${extracted_dump_dir}
cp ${extracted_dump_dir}/* ${desktop_dir}
{ set +x; } 2>/dev/null
echo "Copied to Desktop."
} &> ${out_dir}
cp ${out_dir} ${desktop_dir}
else
# Communicate to users that logged in before the script started that nothing is ready.
wall "The setup_spawnhost_coredump script has just started setting up the debugging environment."
# Write this file that gets cat'ed on login to communicate to users logging in if this setup script is still running.
echo '+-----------------------------------------------------------------------------------+' > ~/.setup_spawnhost_coredump_progress
echo '| The setup script is still setting up data files for inspection on a [${machine}] host. |' >> ~/.setup_spawnhost_coredump_progress
echo '+-----------------------------------------------------------------------------------+' >> ~/.setup_spawnhost_coredump_progress
cat >> ~/.profile <<EOF
cat ~/.setup_spawnhost_coredump_progress
# Coredumps generated by a toolchain built mongodb can be problematic when examined with the system
# gdb.
@@ -33,54 +87,54 @@ echo "To examine a core dump, type 'gdb ./<binary> ./<core file>'"
cat ~/.setup_spawnhost_coredump_progress
EOF
export PATH=/opt/mongodbtoolchain/gdb/bin:$PATH
echo 'if [ -f ~/.profile ]; then
export PATH=/opt/mongodbtoolchain/gdb/bin:$PATH
echo 'if [ -f ~/.profile ]; then
. ~/.profile
fi' >> .bash_profile
# Make a directory on the larger EBS volume. Soft-link it under the home directory. The smaller home
# volume can have trouble particularly with coredumps from sharded timeouts.
mkdir /data/debug
ln -s /data/debug .
cd debug
# Make a directory on the larger EBS volume. Soft-link it under the home directory. The smaller home
# volume can have trouble particularly with coredumps from sharded timeouts.
mkdir /data/debug
ln -s /data/debug .
cd debug
# As the name suggests, pretty printers. Primarily for boost::optional<T>
git clone git@github.com:ruediger/Boost-Pretty-Printer.git &
# As the name suggests, pretty printers. Primarily for boost::optional<T>
git clone git@github.com:ruediger/Boost-Pretty-Printer.git &
# Discover and unarchive necessary files and source code. This will put mongo binaries and their
# partner .debug files in the same `debug/bin` directory. The `bin` directory will later be symbolic
# linked into the top-level (`debug`) directory. Shared library files and their debug symbols will
# be dumped into a `debug/lib` directory for tidiness. The mongo `<reporoot>/src/` directory is soft
# linked as `debug/src`. The .gdbinit file assumes gdb is being run from the `debug` directory.
BIN_ARCHIVE=`ls /data/mci/artifacts-*archive_dist_test*/mongo-*.tgz`
tar --wildcards --strip-components=1 -xzf $BIN_ARCHIVE '*/bin/mongod' '*/bin/mongos' '*/bin/mongo' '*/bin/mongobridge' &
tar --wildcards --strip-components=1 -xzf $BIN_ARCHIVE '*/lib/*' &
DBG_ARCHIVE=`ls /data/mci/artifacts-*archive_dist_test_debug/debugsymbols-*.tgz`
tar --wildcards --strip-components=1 -xzf $DBG_ARCHIVE '*/bin/mongod.debug' '*/bin/mongos.debug' '*/bin/mongo.debug' '*/bin/mongobridge.debug' &
tar --wildcards --strip-components=1 -xzf $DBG_ARCHIVE '*/lib/*' &
UNITTEST_ARCHIVE=`ls /data/mci/artifacts-*run_unittests/mongo-unittests-*.tgz`
tar --wildcards --strip-components=0 -xzf $UNITTEST_ARCHIVE 'bin/*' &
tar --wildcards -xzf $UNITTEST_ARCHIVE 'lib/*' &
# Discover and unarchive necessary files and source code. This will put mongo binaries and their
# partner .debug files in the same `debug/bin` directory. The `bin` directory will later be symbolic
# linked into the top-level (`debug`) directory. Shared library files and their debug symbols will
# be dumped into a `debug/lib` directory for tidiness. The mongo `<reporoot>/src/` directory is soft
# linked as `debug/src`. The .gdbinit file assumes gdb is being run from the `debug` directory.
BIN_ARCHIVE=`ls /data/mci/artifacts-*archive_dist_test*/mongo-*.tgz`
tar --wildcards --strip-components=1 -xzf $BIN_ARCHIVE '*/bin/mongod' '*/bin/mongos' '*/bin/mongo' '*/bin/mongobridge' &
tar --wildcards --strip-components=1 -xzf $BIN_ARCHIVE '*/lib/*' &
DBG_ARCHIVE=`ls /data/mci/artifacts-*archive_dist_test_debug/debugsymbols-*.tgz`
tar --wildcards --strip-components=1 -xzf $DBG_ARCHIVE '*/bin/mongod.debug' '*/bin/mongos.debug' '*/bin/mongo.debug' '*/bin/mongobridge.debug' &
tar --wildcards --strip-components=1 -xzf $DBG_ARCHIVE '*/lib/*' &
UNITTEST_ARCHIVE=`ls /data/mci/artifacts-*run_unittests/mongo-unittests-*.tgz`
tar --wildcards --strip-components=0 -xzf $UNITTEST_ARCHIVE 'bin/*' &
tar --wildcards -xzf $UNITTEST_ARCHIVE 'lib/*' &
SRC_DIR=`find /data/mci/ -maxdepth 1 | grep source`
ln -s $SRC_DIR/.gdbinit .
ln -s $SRC_DIR/src src
ln -s $SRC_DIR/buildscripts buildscripts
SRC_DIR=`find /data/mci/ -maxdepth 1 | grep source`
ln -s $SRC_DIR/.gdbinit .
ln -s $SRC_DIR/src src
ln -s $SRC_DIR/buildscripts buildscripts
# Install pymongo to get the bson library for pretty-printers.
/opt/mongodbtoolchain/v3/bin/pip3 install -r $SRC_DIR/etc/pip/dev-requirements.txt &
# Install pymongo to get the bson library for pretty-printers.
/opt/mongodbtoolchain/v3/bin/pip3 install -r $SRC_DIR/etc/pip/dev-requirements.txt &
COREDUMP_ARCHIVE=`ls /data/mci/artifacts-*/mongo-coredumps-*.tgz`
tar -xzf $COREDUMP_ARCHIVE &
echo "Waiting for background processes to complete."
wait
COREDUMP_ARCHIVE=`ls /data/mci/artifacts-*/mongo-coredumps-*.tgz`
tar -xzf $COREDUMP_ARCHIVE &
echo "Waiting for background processes to complete."
wait
# Symbolic linking all of the executable files is sufficient for `gdb ./mongod ./dump_mongod.core`
# to succeed. This inadvertantly also links in the ".debug" files which is unnecessary, but
# harmless. gdb expects the .debug files to live adjacent to the physical binary.
ln -s bin/* ./
# Symbolic linking all of the executable files is sufficient for `gdb ./mongod ./dump_mongod.core`
# to succeed. This inadvertantly also links in the ".debug" files which is unnecessary, but
# harmless. gdb expects the .debug files to live adjacent to the physical binary.
ln -s bin/* ./
cat >> ~/.gdbinit <<EOF
cat >> ~/.gdbinit <<EOF
set auto-load safe-path /
set solib-search-path ./lib/
set pagination off
@@ -96,10 +150,11 @@ boost.register_printers()
end
EOF
echo "dir $HOME/debug" >> ~/.gdbinit
echo "dir $HOME/debug" >> ~/.gdbinit
# Empty out the progress script that warns users about the set script still running when users log in.
echo "" > ~/.setup_spawnhost_coredump_progress
# Alert currently logged in users that this setup script has completed. Logging back in will ensure any
# paths/environment variables will be set as intended.
wall "The setup_spawnhost_coredump script has completed, please relogin to ensure the right environment variables are set."
# Empty out the progress script that warns users about the set script still running when users log in.
echo "" > ~/.setup_spawnhost_coredump_progress
# Alert currently logged in users that this setup script has completed. Logging back in will ensure any
# paths/environment variables will be set as intended.
wall "The setup_spawnhost_coredump script has completed, please relogin to ensure the right environment variables are set."
fi

View File

@@ -87,7 +87,7 @@ def build_mock_orchestrator(build_expansions=None, task_def_list=None, build_tas
class TestEvgExpansions(unittest.TestCase):
def test_get_max_sub_suites_should_use_patch_value_in_patches(self):
evg_expansions = under_test.EvgExpansions(
is_patch=True,
is_patch="true",
max_sub_suites=5,
mainline_max_sub_suites=1,
build_id="build_id",
@@ -102,7 +102,7 @@ class TestEvgExpansions(unittest.TestCase):
def test_get_max_sub_suites_should_use_mainline_value_in_non_patches(self):
evg_expansions = under_test.EvgExpansions(
is_patch=False,
is_patch="false",
max_sub_suites=5,
mainline_max_sub_suites=1,
build_id="build_id",
@@ -133,6 +133,60 @@ class TestEvgExpansions(unittest.TestCase):
evg_expansions.mainline_max_sub_suites)
class TestDetermineIsPatch(unittest.TestCase):
def test_is_patch_is_none_should_return_false(self):
evg_expansions = under_test.EvgExpansions(
is_patch=None,
build_id="build_id",
build_variant="build variant",
project="project",
revision="abc123",
task_name="task name",
task_id="task_314",
)
self.assertFalse(evg_expansions.determine_is_patch())
def test_is_patch_is_false_should_return_false(self):
evg_expansions = under_test.EvgExpansions(
is_patch="false",
build_id="build_id",
build_variant="build variant",
project="project",
revision="abc123",
task_name="task name",
task_id="task_314",
)
self.assertFalse(evg_expansions.determine_is_patch())
def test_is_patch_is_empty_string_should_return_false(self):
evg_expansions = under_test.EvgExpansions(
is_patch="",
build_id="build_id",
build_variant="build variant",
project="project",
revision="abc123",
task_name="task name",
task_id="task_314",
)
self.assertFalse(evg_expansions.determine_is_patch())
def test_is_patch_is_true_should_return_true(self):
evg_expansions = under_test.EvgExpansions(
is_patch="true",
build_id="build_id",
build_variant="build variant",
project="project",
revision="abc123",
task_name="task name",
task_id="task_314",
)
self.assertTrue(evg_expansions.determine_is_patch())
class TestTranslateRunVar(unittest.TestCase):
def test_normal_value_should_be_returned(self):
run_var = "some value"

View File

@@ -3184,18 +3184,6 @@ tasks:
- func: "send benchmark results"
- func: "analyze benchmark results"
- <<: *benchmark_template
name: benchmarks_bsoncolumn
tags: ["benchmarks"]
commands:
- func: "do benchmark setup"
- func: "run tests"
vars:
suite: benchmarks_bsoncolumn
resmoke_jobs_max: 1
- func: "send benchmark results"
- func: "analyze benchmark results"
- <<: *run_jepsen_template
name: jepsen_register_findAndModify
tags: ["jepsen"]

View File

@@ -15,7 +15,7 @@ variables:
- variant: linux-wt-standalone
name: compile
_real_expansions: &_expansion_updates
[]
[]
###
###
@@ -166,10 +166,6 @@ functions:
params:
script: ./src/dsi/run-dsi determine_failure -m TEST
f_dsi_post_run:
- command: json.send
params:
name: perf
file: ./build/LegacyPerfJson/perf.json
- command: shell.exec
params:
script: ./src/dsi/run-dsi post_run

View File

@@ -22,9 +22,7 @@ variables:
- name: schedule_global_auto_tasks
variant: task_generation
_real_expansions: &_expansion_updates
# TODO: Disable in SERVER-57226.
- key: enable_detect_changes
value: "true"
[]
###
###
@@ -190,12 +188,6 @@ functions:
params:
script: ./src/dsi/run-dsi determine_failure -m TEST
f_dsi_post_run:
# TODO: SERVER-57226 will let us move this json.send to after dsi's post_run.
# This is preferred to keep DSI execution contiguous.
- command: json.send
params:
name: perf
file: ./build/LegacyPerfJson/perf.json
- command: shell.exec
params:
script: ./src/dsi/run-dsi post_run

View File

@@ -9,7 +9,7 @@
*/
'use strict';
const dbPrefix = 'fsmDB_';
const dbPrefix = jsTestName() + '_DB_';
const dbCount = 2;
const collPrefix = 'sharded_coll_';
const collCount = 2;

View File

@@ -7,7 +7,7 @@
*/
'use strict';
const dbPrefix = 'fsmDB_';
const dbPrefix = jsTestName() + '_DB_';
const dbCount = 2;
const collPrefix = 'sharded_coll_';
const collCount = 2;

View File

@@ -10,7 +10,7 @@
* ]
*/
const dbPrefix = 'fsmDB_';
const dbPrefix = jsTestName() + '_DB_';
const dbCount = 2;
const collPrefix = 'sharded_coll_';
const collCount = 2;

View File

@@ -48,9 +48,19 @@ reshardingTest.withReshardingInBackground(
// Wait until participants are aware of the resharding operation.
reshardingTest.awaitCloneTimestampChosen();
const ns = sourceCollection.getFullName();
awaitAbort = startParallelShell(funWithArgs(function(ns) {
db.adminCommand({abortReshardCollection: ns});
}, sourceCollection.getFullName()), mongos.port);
}, ns), mongos.port);
// Wait for the coordinator to have persisted its decision to abort the resharding operation
// as a result of the abortReshardCollection command being processed.
assert.soon(() => {
const coordinatorDoc =
mongos.getCollection("config.reshardingOperations").findOne({ns: ns});
return coordinatorDoc !== null && coordinatorDoc.state === "aborting";
});
},
{
expectedErrorCode: ErrorCodes.ReshardCollectionAborted,

View File

@@ -43,8 +43,9 @@ namespace {
// Start capacity for memory blocks allocated by ElementStorage
constexpr int kStartCapacity = 128;
// Max capacity for memory blocks allocated by ElementStorage
constexpr int kMaxCapacity = 1024 * 32;
// Max capacity for memory blocks allocated by ElementStorage. We need to allow blocks to grow to at
// least BSONObjMaxUserSize so we can construct user objects efficiently.
constexpr int kMaxCapacity = BSONObjMaxUserSize;
// Memory offset to get to BSONElement value when field name is an empty string.
constexpr int kElementValueOffset = 2;

View File

@@ -55,10 +55,14 @@ public:
}
~DeadlineFuture() {
auto lk = stdx::lock_guard(_mutex);
_executor->cancel(_timeoutCbHandle.get());
// The _executor holds the shared ptr on this, the callback will set the promise.
invariant(get().isReady());
{
auto lk = stdx::lock_guard(_mutex);
if (_timeoutCbHandle) {
_executor->cancel(_timeoutCbHandle.get());
}
// The _executor holds the shared ptr on this, the callback will set the promise.
invariant(get().isReady());
}
}
SharedSemiFuture<ResultStatus> get() const {
@@ -104,6 +108,7 @@ private:
std::move(inputFuture).onCompletion([this, self](StatusWith<ResultStatus> status) {
auto lk = stdx::lock_guard(_mutex);
_executor->cancel(_timeoutCbHandle.get());
_timeoutCbHandle = boost::none;
if (!get().isReady()) {
_outputFuturePromise->setFrom(status);
}

View File

@@ -195,15 +195,6 @@ std::vector<AsyncRequestsSender::Response> sendAuthenticatedCommandToShards(
const BSONObj& command,
const std::vector<ShardId>& shardIds,
const std::shared_ptr<executor::TaskExecutor>& executor) {
// TODO SERVER-57519: remove the following scope
{
// Ensure ShardRegistry is initialized before using the AsyncRequestsSender that relies on
// unsafe functions (SERVER-57280)
auto shardRegistry = Grid::get(opCtx)->shardRegistry();
if (!shardRegistry->isUp()) {
shardRegistry->reload(opCtx);
}
}
// The AsyncRequestsSender ignore impersonation metadata so we need to manually attach them to
// the command

View File

@@ -151,9 +151,10 @@ AsyncRequestsSender::RemoteData::RemoteData(AsyncRequestsSender* ars,
BSONObj cmdObj)
: _ars(ars), _shardId(std::move(shardId)), _cmdObj(std::move(cmdObj)) {}
std::shared_ptr<Shard> AsyncRequestsSender::RemoteData::getShard() {
// TODO: Pass down an OperationContext* to use here.
return Grid::get(getGlobalServiceContext())->shardRegistry()->getShardNoReload(_shardId);
SemiFuture<std::shared_ptr<Shard>> AsyncRequestsSender::RemoteData::getShard() noexcept {
return Grid::get(getGlobalServiceContext())
->shardRegistry()
->getShard(*_ars->_subBaton, _shardId);
}
void AsyncRequestsSender::RemoteData::executeRequest() {
@@ -173,7 +174,12 @@ void AsyncRequestsSender::RemoteData::executeRequest() {
auto AsyncRequestsSender::RemoteData::scheduleRequest()
-> SemiFuture<RemoteCommandOnAnyCallbackArgs> {
return resolveShardIdToHostAndPorts(_ars->_readPreference)
return getShard()
.thenRunOn(*_ars->_subBaton)
.then([this](auto&& shard) {
return shard->getTargeter()->findHosts(_ars->_readPreference,
CancellationToken::uncancelable());
})
.thenRunOn(*_ars->_subBaton)
.then([this](auto&& hostAndPorts) {
_shardHostAndPort.emplace(hostAndPorts.front());
@@ -183,17 +189,6 @@ auto AsyncRequestsSender::RemoteData::scheduleRequest()
.semi();
}
SemiFuture<std::vector<HostAndPort>> AsyncRequestsSender::RemoteData::resolveShardIdToHostAndPorts(
const ReadPreferenceSetting& readPref) {
const auto shard = getShard();
if (!shard) {
return Status(ErrorCodes::ShardNotFound,
str::stream() << "Could not find shard " << _shardId);
}
return shard->getTargeter()->findHosts(readPref, CancellationToken::uncancelable());
}
auto AsyncRequestsSender::RemoteData::scheduleRemoteCommand(std::vector<HostAndPort>&& hostAndPorts)
-> SemiFuture<RemoteCommandOnAnyCallbackArgs> {
hangBeforeSchedulingRemoteCommand.executeIf(
@@ -259,43 +254,47 @@ auto AsyncRequestsSender::RemoteData::handleResponse(RemoteCommandOnAnyCallbackA
}
// There was an error with either the response or the command.
auto shard = getShard();
if (!shard) {
uasserted(ErrorCodes::ShardNotFound, str::stream() << "Could not find shard " << _shardId);
} else {
std::vector<HostAndPort> failedTargets;
return getShard()
.thenRunOn(*_ars->_subBaton)
.then([this, status = std::move(status), rcr = std::move(rcr)](
std::shared_ptr<mongo::Shard>&& shard) {
std::vector<HostAndPort> failedTargets;
if (rcr.response.target) {
failedTargets = {*rcr.response.target};
} else {
failedTargets = rcr.request.target;
}
if (rcr.response.target) {
failedTargets = {*rcr.response.target};
} else {
failedTargets = rcr.request.target;
}
shard->updateReplSetMonitor(failedTargets.front(), status);
bool isStartingTransaction = _cmdObj.getField("startTransaction").booleanSafe();
if (!_ars->_stopRetrying && shard->isRetriableError(status.code(), _ars->_retryPolicy) &&
_retryCount < kMaxNumFailedHostRetryAttempts && !isStartingTransaction) {
shard->updateReplSetMonitor(failedTargets.front(), status);
bool isStartingTransaction = _cmdObj.getField("startTransaction").booleanSafe();
if (!_ars->_stopRetrying &&
shard->isRetriableError(status.code(), _ars->_retryPolicy) &&
_retryCount < kMaxNumFailedHostRetryAttempts && !isStartingTransaction) {
LOGV2_DEBUG(4615637,
1,
"Command to remote {shardId} for hosts {hosts} failed with retryable error "
"{error} and will be retried",
"Command to remote shard failed with retryable error and will be retried",
"shardId"_attr = _shardId,
"hosts"_attr = failedTargets,
"error"_attr = redact(status));
++_retryCount;
_shardHostAndPort.reset();
// retry through recursion
return scheduleRequest();
}
}
LOGV2_DEBUG(
4615637,
1,
"Command to remote {shardId} for hosts {hosts} failed with retryable error "
"{error} and will be retried",
"Command to remote shard failed with retryable error and will be retried",
"shardId"_attr = _shardId,
"hosts"_attr = failedTargets,
"error"_attr = redact(status));
++_retryCount;
_shardHostAndPort.reset();
// retry through recursion
return scheduleRequest();
}
// Status' in the response.status field that aren't retried get converted to top level errors
uassertStatusOK(rcr.response.status);
// Status' in the response.status field that aren't retried get converted to top level
// errors
uassertStatusOK(rcr.response.status);
// We're not okay (on the remote), but still not going to retry
return std::move(rcr);
// We're not okay (on the remote), but still not going to retry
return Future<RemoteCommandOnAnyCallbackArgs>::makeReady(std::move(rcr)).semi();
})
.semi();
};
} // namespace mongo

View File

@@ -177,9 +177,15 @@ private:
RemoteData(AsyncRequestsSender* ars, ShardId shardId, BSONObj cmdObj);
/**
* Returns the Shard object associated with this remote.
* Returns a SemiFuture containing a shard object associated with this remote.
*
* This will return a SemiFuture with a ShardNotFound error status in case the shard is not
* found.
*
* Additionally this call can trigger a refresh of the ShardRegistry so it could possibly
* return other network error status related to the refresh.
*/
std::shared_ptr<Shard> getShard();
SemiFuture<std::shared_ptr<Shard>> getShard() noexcept;
/**
* Returns true if we've already queued a response from the remote.

View File

@@ -218,9 +218,9 @@ void ShardRegistry::startupPeriodicReloader(OperationContext* opCtx) {
AsyncTry([this] {
LOGV2_DEBUG(22726, 1, "Reloading shardRegistry");
return _reloadInternal();
return _reloadAsyncNoRetry();
})
.until([](auto sw) {
.until([](auto&& sw) {
if (!sw.isOK()) {
LOGV2(22727,
"Error running periodic reload of shard registry",
@@ -232,7 +232,7 @@ void ShardRegistry::startupPeriodicReloader(OperationContext* opCtx) {
})
.withDelayBetweenIterations(kRefreshPeriod) // This call is optional.
.on(_executor, CancellationToken::uncancelable())
.getAsync([](auto sw) {
.getAsync([](auto&& sw) {
LOGV2_DEBUG(22725,
1,
"Exiting periodic shard registry reloader",
@@ -295,6 +295,49 @@ StatusWith<std::shared_ptr<Shard>> ShardRegistry::getShard(OperationContext* opC
return {ErrorCodes::ShardNotFound, str::stream() << "Shard " << shardId << " not found"};
}
SemiFuture<std::shared_ptr<Shard>> ShardRegistry::getShard(ExecutorPtr executor,
const ShardId& shardId) noexcept {
// Fetch the shard registry data associated to the latest known topology time
return _getDataAsync()
.thenRunOn(executor)
.then([this, executor, shardId](auto&& cachedData) {
// First check if this is a non config shard lookup
if (auto shard = cachedData->findShard(shardId)) {
return SemiFuture<std::shared_ptr<Shard>>::makeReady(std::move(shard));
}
// then check if this is a config shard (this call is blocking in any case)
{
stdx::lock_guard<Latch> lk(_mutex);
if (auto shard = _configShardData.findShard(shardId)) {
return SemiFuture<std::shared_ptr<Shard>>::makeReady(std::move(shard));
}
}
// If the shard was not found, force reload the shard regitry data and try again.
//
// This is to cover the following scenario:
// 1. Primary of the replicaset fetch the list of shards and store it on disk
// 2. Primary crash before the latest VectorClock topology time is majority written to
// disk
// 3. A new primary with a stale ShardRegistry is elected and read the set of shards
// from disk and calls ShardRegistry::getShard
return _reloadAsync()
.thenRunOn(executor)
.then([this, executor, shardId](auto&& cachedData) -> std::shared_ptr<Shard> {
auto shard = cachedData->findShard(shardId);
uassert(ErrorCodes::ShardNotFound,
str::stream() << "Shard " << shardId << " not found",
shard);
return shard;
})
.semi();
})
.semi();
}
std::vector<ShardId> ShardRegistry::getAllShardIds(OperationContext* opCtx) {
auto shardIds = _getData(opCtx)->getAllShardIds();
if (shardIds.empty()) {
@@ -400,24 +443,27 @@ void ShardRegistry::toBSON(BSONObjBuilder* result) const {
}
void ShardRegistry::reload(OperationContext* opCtx) {
_reloadAsync().get(opCtx);
}
SharedSemiFuture<ShardRegistry::Cache::ValueHandle> ShardRegistry::_reloadAsync() {
if (MONGO_unlikely(TestingProctor::instance().isEnabled())) {
// TODO SERVER-62152 investigate hang on reload in unit tests
// Some unit tests don't support running the reload's AsyncTry on the fixed executor.
_reloadInternal().get(opCtx);
return _reloadAsyncNoRetry();
} else {
AsyncTry([=]() mutable { return _reloadInternal(); })
return AsyncTry([=]() mutable { return _reloadAsyncNoRetry(); })
.until([](auto sw) mutable {
return sw.getStatus() != ErrorCodes::ReadConcernMajorityNotAvailableYet;
})
.withBackoffBetweenIterations(kExponentialBackoff)
.on(Grid::get(opCtx)->getExecutorPool()->getFixedExecutor(),
.on(Grid::get(getGlobalServiceContext())->getExecutorPool()->getFixedExecutor(),
CancellationToken::uncancelable())
.semi()
.get(opCtx);
.share();
}
}
SharedSemiFuture<ShardRegistry::Cache::ValueHandle> ShardRegistry::_reloadInternal() {
SharedSemiFuture<ShardRegistry::Cache::ValueHandle> ShardRegistry::_reloadAsyncNoRetry() {
// Make the next acquire do a lookup.
auto value = _forceReloadIncrement.addAndFetch(1);
LOGV2_DEBUG(4620253, 2, "Forcing ShardRegistry reload", "newForceReloadIncrement"_attr = value);

View File

@@ -239,6 +239,9 @@ public:
*/
StatusWith<std::shared_ptr<Shard>> getShard(OperationContext* opCtx, const ShardId& shardId);
SemiFuture<std::shared_ptr<Shard>> getShard(ExecutorPtr executor,
const ShardId& shardId) noexcept;
/**
* Returns a vector containing all known shard IDs.
* The order of the elements is not guaranteed.
@@ -438,7 +441,8 @@ private:
void _initializeCacheIfNecessary() const;
SharedSemiFuture<Cache::ValueHandle> _reloadInternal();
SharedSemiFuture<Cache::ValueHandle> _reloadAsync();
SharedSemiFuture<Cache::ValueHandle> _reloadAsyncNoRetry();
/**
* Factory to create shards. Never changed after startup so safe to access outside of _mutex.