SERVER-67405: Fix is_patch lookup for build_variant_gen

(cherry picked from commit 1260ae9d64)
SERVER-57519 Make ARS use causally consistent ShardRegistry::getShard() function
2022-07-12 13:47:15 +00:00 · 2022-07-11 21:46:35 +00:00 · 2022-06-09 18:41:13 +00:00 · 2022-06-08 19:52:08 +00:00 · 2022-06-07 20:28:11 +00:00 · 2022-06-06 17:12:52 +00:00
17 changed files with 318 additions and 160 deletions
--- a/buildscripts/evergreen_gen_build_variant.py
+++ b/buildscripts/evergreen_gen_build_variant.py
@@ -59,7 +59,7 @@ class EvgExpansions(BaseModel):
    build_id: str
    build_variant: str
    exec_timeout_secs: Optional[int] = None
-    is_patch: Optional[bool]
+    is_patch: Optional[str]
    project: str
    max_tests_per_suite: Optional[int] = 100
    max_sub_suites: Optional[int] = 5
@@ -83,10 +83,21 @@ class EvgExpansions(BaseModel):

    def get_max_sub_suites(self) -> int:
        """Get the max_sub_suites to use."""
-        if not self.is_patch:
+        if not self.determine_is_patch():
            return self.mainline_max_sub_suites
        return self.max_sub_suites

+    def determine_is_patch(self) -> bool:
+        """
+        Determine if expansions indicate whether the script is being run in a patch build.
+
+        In a patch build, the `is_patch` expansion will be the string value of "true". In a
+        non-patch setting, it will not exist, or be an empty string.
+
+        :return: True if task is being run in a patch build.
+        """
+        return self.is_patch is not None and self.is_patch.lower() == "true"
+
    def build_suite_split_config(self, start_date: datetime,
                                 end_date: datetime) -> SuiteSplitConfig:
        """
@@ -113,7 +124,7 @@ class EvgExpansions(BaseModel):
        """
        return GenTaskOptions(
            create_misc_suite=True,
-            is_patch=self.is_patch,
+            is_patch=self.determine_is_patch(),
            generated_config_dir=GENERATED_CONFIG_DIR,
            use_default_timeouts=False,
            timeout_secs=self.timeout_secs,
--- a/buildscripts/setup_spawnhost_coredump
+++ b/buildscripts/setup_spawnhost_coredump
@@ -2,15 +2,69 @@

 cd $HOME # workaround EVG-12829

-# Communicate to users that logged in before the script started that nothing is ready.
-wall "The setup_spawnhost_coredump script has just started setting up the debugging environment."
+unameOut=$(uname -s)
+case "${unameOut}" in
+    Linux*)     machine=Linux;;
+    Darwin*)    machine=Mac;;
+    CYGWIN*)    machine=Cygwin;;
+    *)          machine="UNKNOWN:${unameOut}"
+esac

-# Write this file that gets cat'ed on login to communicate to users logging in if this setup script is still running.
-echo '+-----------------------------------------------------------------+' > ~/.setup_spawnhost_coredump_progress
-echo '| The setup script is still setting up data files for inspection. |' >> ~/.setup_spawnhost_coredump_progress
-echo '+-----------------------------------------------------------------+' >> ~/.setup_spawnhost_coredump_progress
+if [[ "${machine}" = "Cygwin" ]]; then
+    out_dir="/cygdrive/c/setup_script_output.txt"
+    desktop_dir="/cygdrive/c/Users/Administrator/Desktop"

-cat >> ~/.profile <<EOF
+    {
+        date
+        env
+
+        echo "----------------------"
+        echo -e "\n=> Setting _NT_SOURCE_PATH environment variable for debuggers to pick up source files."
+        src_dir_hash=$(readlink -f /cygdrive/z/data/mci/source-*)
+        full_src_dir="${src_dir_hash}/src"
+        echo "Source Path: [${full_src_dir}]"
+        set -x;
+        setx _NT_SOURCE_PATH "${full_src_dir}"
+        { set +x; } 2>/dev/null
+
+        echo -e "\n=> Setting _NT_SYMBOL_PATH environment variable for debuggers to pick up the symbols."
+        sym_parent_dir=$(readlink -f /cygdrive/z/data/mci/artifacts-*dist_test_debug)
+        sym_dir=$(readlink -f ${sym_parent_dir}/debugsymbols-mongodb*zip)
+        sym_extracted_dir="${sym_parent_dir}/extracted_symbols"
+        full_sym_dir="${sym_extracted_dir}/dist-test/bin"
+        echo "Symbols Dir: [${full_sym_dir}]"
+
+        echo -e "\n=> Extracting Symbol files."
+        set -x;
+        mkdir ${sym_extracted_dir}
+        unzip -n ${sym_dir} -d ${sym_extracted_dir}
+        setx _NT_SYMBOL_PATH "${full_sym_dir};srv*;"
+        { set +x; } 2>/dev/null
+
+        echo -e "\n=> Extracting Core Dump to Desktop."
+        full_dump_dir=$(readlink -f /cygdrive/z/data/mci/artifacts-* | grep -v dist_test)
+        full_dump_parent_dir=$(readlink -f ${full_dump_dir}/mongo-coredumps*tgz)
+        extracted_dump_dir="${full_dump_dir}/extracted_dump"
+        set -x;
+        mkdir ${extracted_dump_dir}
+        tar -xzvf ${full_dump_parent_dir} -C ${extracted_dump_dir}
+        cp ${extracted_dump_dir}/* ${desktop_dir}
+        { set +x; } 2>/dev/null
+        echo "Copied to Desktop."
+
+    } &> ${out_dir}
+
+    cp ${out_dir} ${desktop_dir}
+else
+    # Communicate to users that logged in before the script started that nothing is ready.
+    wall "The setup_spawnhost_coredump script has just started setting up the debugging environment."
+
+    # Write this file that gets cat'ed on login to communicate to users logging in if this setup script is still running.
+    echo '+-----------------------------------------------------------------------------------+' > ~/.setup_spawnhost_coredump_progress
+    echo '| The setup script is still setting up data files for inspection on a [${machine}] host. |' >> ~/.setup_spawnhost_coredump_progress
+    echo '+-----------------------------------------------------------------------------------+' >> ~/.setup_spawnhost_coredump_progress
+
+    cat >> ~/.profile <<EOF
 cat ~/.setup_spawnhost_coredump_progress
 # Coredumps generated by a toolchain built mongodb can be problematic when examined with the system
 # gdb.
@@ -33,54 +87,54 @@ echo "To examine a core dump, type 'gdb ./<binary> ./<core file>'"
 cat ~/.setup_spawnhost_coredump_progress
 EOF

-export PATH=/opt/mongodbtoolchain/gdb/bin:$PATH
-echo 'if [ -f ~/.profile ]; then
+    export PATH=/opt/mongodbtoolchain/gdb/bin:$PATH
+    echo 'if [ -f ~/.profile ]; then
    . ~/.profile
 fi' >> .bash_profile

-# Make a directory on the larger EBS volume. Soft-link it under the home directory. The smaller home
-# volume can have trouble particularly with coredumps from sharded timeouts.
-mkdir /data/debug
-ln -s /data/debug .
-cd debug
+    # Make a directory on the larger EBS volume. Soft-link it under the home directory. The smaller home
+    # volume can have trouble particularly with coredumps from sharded timeouts.
+    mkdir /data/debug
+    ln -s /data/debug .
+    cd debug

-# As the name suggests, pretty printers. Primarily for boost::optional<T>
-git clone git@github.com:ruediger/Boost-Pretty-Printer.git &
+    # As the name suggests, pretty printers. Primarily for boost::optional<T>
+    git clone git@github.com:ruediger/Boost-Pretty-Printer.git &

-# Discover and unarchive necessary files and source code. This will put mongo binaries and their
-# partner .debug files in the same `debug/bin` directory. The `bin` directory will later be symbolic
-# linked into the top-level (`debug`) directory. Shared library files and their debug symbols will
-# be dumped into a `debug/lib` directory for tidiness. The mongo `<reporoot>/src/` directory is soft
-# linked as `debug/src`. The .gdbinit file assumes gdb is being run from the `debug` directory.
-BIN_ARCHIVE=`ls /data/mci/artifacts-*archive_dist_test*/mongo-*.tgz`
-tar --wildcards --strip-components=1 -xzf $BIN_ARCHIVE '*/bin/mongod' '*/bin/mongos' '*/bin/mongo' '*/bin/mongobridge' &
-tar --wildcards --strip-components=1 -xzf $BIN_ARCHIVE '*/lib/*' &
-DBG_ARCHIVE=`ls /data/mci/artifacts-*archive_dist_test_debug/debugsymbols-*.tgz`
-tar --wildcards --strip-components=1 -xzf $DBG_ARCHIVE '*/bin/mongod.debug' '*/bin/mongos.debug' '*/bin/mongo.debug' '*/bin/mongobridge.debug' &
-tar --wildcards --strip-components=1 -xzf $DBG_ARCHIVE '*/lib/*' &
-UNITTEST_ARCHIVE=`ls /data/mci/artifacts-*run_unittests/mongo-unittests-*.tgz`
-tar --wildcards --strip-components=0 -xzf $UNITTEST_ARCHIVE 'bin/*' &
-tar --wildcards -xzf $UNITTEST_ARCHIVE 'lib/*' &
+    # Discover and unarchive necessary files and source code. This will put mongo binaries and their
+    # partner .debug files in the same `debug/bin` directory. The `bin` directory will later be symbolic
+    # linked into the top-level (`debug`) directory. Shared library files and their debug symbols will
+    # be dumped into a `debug/lib` directory for tidiness. The mongo `<reporoot>/src/` directory is soft
+    # linked as `debug/src`. The .gdbinit file assumes gdb is being run from the `debug` directory.
+    BIN_ARCHIVE=`ls /data/mci/artifacts-*archive_dist_test*/mongo-*.tgz`
+    tar --wildcards --strip-components=1 -xzf $BIN_ARCHIVE '*/bin/mongod' '*/bin/mongos' '*/bin/mongo' '*/bin/mongobridge' &
+    tar --wildcards --strip-components=1 -xzf $BIN_ARCHIVE '*/lib/*' &
+    DBG_ARCHIVE=`ls /data/mci/artifacts-*archive_dist_test_debug/debugsymbols-*.tgz`
+    tar --wildcards --strip-components=1 -xzf $DBG_ARCHIVE '*/bin/mongod.debug' '*/bin/mongos.debug' '*/bin/mongo.debug' '*/bin/mongobridge.debug' &
+    tar --wildcards --strip-components=1 -xzf $DBG_ARCHIVE '*/lib/*' &
+    UNITTEST_ARCHIVE=`ls /data/mci/artifacts-*run_unittests/mongo-unittests-*.tgz`
+    tar --wildcards --strip-components=0 -xzf $UNITTEST_ARCHIVE 'bin/*' &
+    tar --wildcards -xzf $UNITTEST_ARCHIVE 'lib/*' &

-SRC_DIR=`find /data/mci/ -maxdepth 1 | grep source`
-ln -s $SRC_DIR/.gdbinit .
-ln -s $SRC_DIR/src src
-ln -s $SRC_DIR/buildscripts buildscripts
+    SRC_DIR=`find /data/mci/ -maxdepth 1 | grep source`
+    ln -s $SRC_DIR/.gdbinit .
+    ln -s $SRC_DIR/src src
+    ln -s $SRC_DIR/buildscripts buildscripts

-# Install pymongo to get the bson library for pretty-printers.
-/opt/mongodbtoolchain/v3/bin/pip3 install -r $SRC_DIR/etc/pip/dev-requirements.txt &
+    # Install pymongo to get the bson library for pretty-printers.
+    /opt/mongodbtoolchain/v3/bin/pip3 install -r $SRC_DIR/etc/pip/dev-requirements.txt &

-COREDUMP_ARCHIVE=`ls /data/mci/artifacts-*/mongo-coredumps-*.tgz`
-tar -xzf $COREDUMP_ARCHIVE &
-echo "Waiting for background processes to complete."
-wait
+    COREDUMP_ARCHIVE=`ls /data/mci/artifacts-*/mongo-coredumps-*.tgz`
+    tar -xzf $COREDUMP_ARCHIVE &
+    echo "Waiting for background processes to complete."
+    wait

-# Symbolic linking all of the executable files is sufficient for `gdb ./mongod ./dump_mongod.core`
-# to succeed. This inadvertantly also links in the ".debug" files which is unnecessary, but
-# harmless. gdb expects the .debug files to live adjacent to the physical binary.
-ln -s bin/* ./
+    # Symbolic linking all of the executable files is sufficient for `gdb ./mongod ./dump_mongod.core`
+    # to succeed. This inadvertantly also links in the ".debug" files which is unnecessary, but
+    # harmless. gdb expects the .debug files to live adjacent to the physical binary.
+    ln -s bin/* ./

-cat >> ~/.gdbinit <<EOF
+    cat >> ~/.gdbinit <<EOF
 set auto-load safe-path /
 set solib-search-path ./lib/
 set pagination off
@@ -96,10 +150,11 @@ boost.register_printers()
 end
 EOF

-echo "dir $HOME/debug" >> ~/.gdbinit
+    echo "dir $HOME/debug" >> ~/.gdbinit

-# Empty out the progress script that warns users about the set script still running when users log in.
-echo "" > ~/.setup_spawnhost_coredump_progress
-# Alert currently logged in users that this setup script has completed. Logging back in will ensure any
-# paths/environment variables will be set as intended.
-wall "The setup_spawnhost_coredump script has completed, please relogin to ensure the right environment variables are set."
+    # Empty out the progress script that warns users about the set script still running when users log in.
+    echo "" > ~/.setup_spawnhost_coredump_progress
+    # Alert currently logged in users that this setup script has completed. Logging back in will ensure any
+    # paths/environment variables will be set as intended.
+    wall "The setup_spawnhost_coredump script has completed, please relogin to ensure the right environment variables are set."
+fi
--- a/buildscripts/tests/test_evergreen_gen_build_variant.py
+++ b/buildscripts/tests/test_evergreen_gen_build_variant.py
@@ -87,7 +87,7 @@ def build_mock_orchestrator(build_expansions=None, task_def_list=None, build_tas
 class TestEvgExpansions(unittest.TestCase):
    def test_get_max_sub_suites_should_use_patch_value_in_patches(self):
        evg_expansions = under_test.EvgExpansions(
-            is_patch=True,
+            is_patch="true",
            max_sub_suites=5,
            mainline_max_sub_suites=1,
            build_id="build_id",
@@ -102,7 +102,7 @@ class TestEvgExpansions(unittest.TestCase):

    def test_get_max_sub_suites_should_use_mainline_value_in_non_patches(self):
        evg_expansions = under_test.EvgExpansions(
-            is_patch=False,
+            is_patch="false",
            max_sub_suites=5,
            mainline_max_sub_suites=1,
            build_id="build_id",
@@ -133,6 +133,60 @@ class TestEvgExpansions(unittest.TestCase):
                         evg_expansions.mainline_max_sub_suites)


+class TestDetermineIsPatch(unittest.TestCase):
+    def test_is_patch_is_none_should_return_false(self):
+        evg_expansions = under_test.EvgExpansions(
+            is_patch=None,
+            build_id="build_id",
+            build_variant="build variant",
+            project="project",
+            revision="abc123",
+            task_name="task name",
+            task_id="task_314",
+        )
+
+        self.assertFalse(evg_expansions.determine_is_patch())
+
+    def test_is_patch_is_false_should_return_false(self):
+        evg_expansions = under_test.EvgExpansions(
+            is_patch="false",
+            build_id="build_id",
+            build_variant="build variant",
+            project="project",
+            revision="abc123",
+            task_name="task name",
+            task_id="task_314",
+        )
+
+        self.assertFalse(evg_expansions.determine_is_patch())
+
+    def test_is_patch_is_empty_string_should_return_false(self):
+        evg_expansions = under_test.EvgExpansions(
+            is_patch="",
+            build_id="build_id",
+            build_variant="build variant",
+            project="project",
+            revision="abc123",
+            task_name="task name",
+            task_id="task_314",
+        )
+
+        self.assertFalse(evg_expansions.determine_is_patch())
+
+    def test_is_patch_is_true_should_return_true(self):
+        evg_expansions = under_test.EvgExpansions(
+            is_patch="true",
+            build_id="build_id",
+            build_variant="build variant",
+            project="project",
+            revision="abc123",
+            task_name="task name",
+            task_id="task_314",
+        )
+
+        self.assertTrue(evg_expansions.determine_is_patch())
+
+
 class TestTranslateRunVar(unittest.TestCase):
    def test_normal_value_should_be_returned(self):
        run_var = "some value"
--- a/etc/evergreen.yml
+++ b/etc/evergreen.yml
@@ -3184,18 +3184,6 @@ tasks:
  - func: "send benchmark results"
  - func: "analyze benchmark results"

- <<: *benchmark_template
-  name: benchmarks_bsoncolumn
-  tags: ["benchmarks"]
-  commands:
-  - func: "do benchmark setup"
-  - func: "run tests"
-    vars:
-      suite: benchmarks_bsoncolumn
-      resmoke_jobs_max: 1
-  - func: "send benchmark results"
-  - func: "analyze benchmark results"
-
 - <<: *run_jepsen_template
  name: jepsen_register_findAndModify
  tags: ["jepsen"]
--- a/etc/perf.yml
+++ b/etc/perf.yml
@@ -15,7 +15,7 @@ variables:
    - variant: linux-wt-standalone
      name: compile
  _real_expansions: &_expansion_updates
-    []
+      []
  ###

  ###
@@ -166,10 +166,6 @@ functions:
      params:
        script: ./src/dsi/run-dsi determine_failure -m TEST
  f_dsi_post_run:
-    - command: json.send
-      params:
-        name: perf
-        file: ./build/LegacyPerfJson/perf.json
    - command: shell.exec
      params:
        script: ./src/dsi/run-dsi post_run
--- a/etc/system_perf.yml
+++ b/etc/system_perf.yml
@@ -22,9 +22,7 @@ variables:
      - name: schedule_global_auto_tasks
        variant: task_generation
  _real_expansions: &_expansion_updates
-    # TODO: Disable in SERVER-57226.
-    - key: enable_detect_changes
-      value: "true"
+      []
  ###

 ###
@@ -190,12 +188,6 @@ functions:
      params:
        script: ./src/dsi/run-dsi determine_failure -m TEST
  f_dsi_post_run:
-    # TODO: SERVER-57226 will let us move this json.send to after dsi's post_run.
-    # This is preferred to keep DSI execution contiguous.
-    - command: json.send
-      params:
-        name: perf
-        file: ./build/LegacyPerfJson/perf.json
    - command: shell.exec
      params:
        script: ./src/dsi/run-dsi post_run
--- a/jstests/concurrency/fsm_workloads/drop_collection_sharded.js
+++ b/jstests/concurrency/fsm_workloads/drop_collection_sharded.js
@@ -9,7 +9,7 @@
 */
 'use strict';

-const dbPrefix = 'fsmDB_';
+const dbPrefix = jsTestName() + '_DB_';
 const dbCount = 2;
 const collPrefix = 'sharded_coll_';
 const collCount = 2;
--- a/jstests/concurrency/fsm_workloads/drop_database_sharded.js
+++ b/jstests/concurrency/fsm_workloads/drop_database_sharded.js
@@ -7,7 +7,7 @@
 */
 'use strict';

-const dbPrefix = 'fsmDB_';
+const dbPrefix = jsTestName() + '_DB_';
 const dbCount = 2;
 const collPrefix = 'sharded_coll_';
 const collCount = 2;
--- a/jstests/concurrency/fsm_workloads/random_DDL_operations.js
+++ b/jstests/concurrency/fsm_workloads/random_DDL_operations.js
@@ -10,7 +10,7 @@
 *  ]
 */

-const dbPrefix = 'fsmDB_';
+const dbPrefix = jsTestName() + '_DB_';
 const dbCount = 2;
 const collPrefix = 'sharded_coll_';
 const collCount = 2;
--- a/jstests/sharding/resharding_coordinator_recovers_abort_decision.js
+++ b/jstests/sharding/resharding_coordinator_recovers_abort_decision.js
@@ -48,9 +48,19 @@ reshardingTest.withReshardingInBackground(
        // Wait until participants are aware of the resharding operation.
        reshardingTest.awaitCloneTimestampChosen();

+        const ns = sourceCollection.getFullName();
        awaitAbort = startParallelShell(funWithArgs(function(ns) {
                                            db.adminCommand({abortReshardCollection: ns});
-                                        }, sourceCollection.getFullName()), mongos.port);
+                                        }, ns), mongos.port);
+
+        // Wait for the coordinator to have persisted its decision to abort the resharding operation
+        // as a result of the abortReshardCollection command being processed.
+        assert.soon(() => {
+            const coordinatorDoc =
+                mongos.getCollection("config.reshardingOperations").findOne({ns: ns});
+
+            return coordinatorDoc !== null && coordinatorDoc.state === "aborting";
+        });
    },
    {
        expectedErrorCode: ErrorCodes.ReshardCollectionAborted,
--- a/src/mongo/bson/util/bsoncolumn.cpp
+++ b/src/mongo/bson/util/bsoncolumn.cpp
@@ -43,8 +43,9 @@ namespace {
 // Start capacity for memory blocks allocated by ElementStorage
 constexpr int kStartCapacity = 128;

-// Max capacity for memory blocks allocated by ElementStorage
-constexpr int kMaxCapacity = 1024 * 32;
+// Max capacity for memory blocks allocated by ElementStorage. We need to allow blocks to grow to at
+// least BSONObjMaxUserSize so we can construct user objects efficiently.
+constexpr int kMaxCapacity = BSONObjMaxUserSize;

 // Memory offset to get to BSONElement value when field name is an empty string.
 constexpr int kElementValueOffset = 2;
--- a/src/mongo/db/process_health/deadline_future.h
+++ b/src/mongo/db/process_health/deadline_future.h
@@ -55,10 +55,14 @@ public:
    }

    ~DeadlineFuture() {
-        auto lk = stdx::lock_guard(_mutex);
-        _executor->cancel(_timeoutCbHandle.get());
-        // The _executor holds the shared ptr on this, the callback will set the promise.
-        invariant(get().isReady());
+        {
+            auto lk = stdx::lock_guard(_mutex);
+            if (_timeoutCbHandle) {
+                _executor->cancel(_timeoutCbHandle.get());
+            }
+            // The _executor holds the shared ptr on this, the callback will set the promise.
+            invariant(get().isReady());
+        }
    }

    SharedSemiFuture<ResultStatus> get() const {
@@ -104,6 +108,7 @@ private:
            std::move(inputFuture).onCompletion([this, self](StatusWith<ResultStatus> status) {
                auto lk = stdx::lock_guard(_mutex);
                _executor->cancel(_timeoutCbHandle.get());
+                _timeoutCbHandle = boost::none;
                if (!get().isReady()) {
                    _outputFuturePromise->setFrom(status);
                }
--- a/src/mongo/db/s/sharding_ddl_util.cpp
+++ b/src/mongo/db/s/sharding_ddl_util.cpp
@@ -195,15 +195,6 @@ std::vector<AsyncRequestsSender::Response> sendAuthenticatedCommandToShards(
    const BSONObj& command,
    const std::vector<ShardId>& shardIds,
    const std::shared_ptr<executor::TaskExecutor>& executor) {
-    // TODO SERVER-57519: remove the following scope
-    {
-        // Ensure ShardRegistry is initialized before using the AsyncRequestsSender that relies on
-        // unsafe functions (SERVER-57280)
-        auto shardRegistry = Grid::get(opCtx)->shardRegistry();
-        if (!shardRegistry->isUp()) {
-            shardRegistry->reload(opCtx);
-        }
-    }

    // The AsyncRequestsSender ignore impersonation metadata so we need to manually attach them to
    // the command
--- a/src/mongo/s/async_requests_sender.cpp
+++ b/src/mongo/s/async_requests_sender.cpp
@@ -151,9 +151,10 @@ AsyncRequestsSender::RemoteData::RemoteData(AsyncRequestsSender* ars,
                                            BSONObj cmdObj)
    : _ars(ars), _shardId(std::move(shardId)), _cmdObj(std::move(cmdObj)) {}

-std::shared_ptr<Shard> AsyncRequestsSender::RemoteData::getShard() {
-    // TODO: Pass down an OperationContext* to use here.
-    return Grid::get(getGlobalServiceContext())->shardRegistry()->getShardNoReload(_shardId);
+SemiFuture<std::shared_ptr<Shard>> AsyncRequestsSender::RemoteData::getShard() noexcept {
+    return Grid::get(getGlobalServiceContext())
+        ->shardRegistry()
+        ->getShard(*_ars->_subBaton, _shardId);
 }

 void AsyncRequestsSender::RemoteData::executeRequest() {
@@ -173,7 +174,12 @@ void AsyncRequestsSender::RemoteData::executeRequest() {

 auto AsyncRequestsSender::RemoteData::scheduleRequest()
    -> SemiFuture<RemoteCommandOnAnyCallbackArgs> {
-    return resolveShardIdToHostAndPorts(_ars->_readPreference)
+    return getShard()
+        .thenRunOn(*_ars->_subBaton)
+        .then([this](auto&& shard) {
+            return shard->getTargeter()->findHosts(_ars->_readPreference,
+                                                   CancellationToken::uncancelable());
+        })
        .thenRunOn(*_ars->_subBaton)
        .then([this](auto&& hostAndPorts) {
            _shardHostAndPort.emplace(hostAndPorts.front());
@@ -183,17 +189,6 @@ auto AsyncRequestsSender::RemoteData::scheduleRequest()
        .semi();
 }

-SemiFuture<std::vector<HostAndPort>> AsyncRequestsSender::RemoteData::resolveShardIdToHostAndPorts(
-    const ReadPreferenceSetting& readPref) {
-    const auto shard = getShard();
-    if (!shard) {
-        return Status(ErrorCodes::ShardNotFound,
-                      str::stream() << "Could not find shard " << _shardId);
-    }
-
-    return shard->getTargeter()->findHosts(readPref, CancellationToken::uncancelable());
-}
-
 auto AsyncRequestsSender::RemoteData::scheduleRemoteCommand(std::vector<HostAndPort>&& hostAndPorts)
    -> SemiFuture<RemoteCommandOnAnyCallbackArgs> {
    hangBeforeSchedulingRemoteCommand.executeIf(
@@ -259,43 +254,47 @@ auto AsyncRequestsSender::RemoteData::handleResponse(RemoteCommandOnAnyCallbackA
    }

    // There was an error with either the response or the command.
-    auto shard = getShard();
-    if (!shard) {
-        uasserted(ErrorCodes::ShardNotFound, str::stream() << "Could not find shard " << _shardId);
-    } else {
-        std::vector<HostAndPort> failedTargets;
+    return getShard()
+        .thenRunOn(*_ars->_subBaton)
+        .then([this, status = std::move(status), rcr = std::move(rcr)](
+                  std::shared_ptr<mongo::Shard>&& shard) {
+            std::vector<HostAndPort> failedTargets;

-        if (rcr.response.target) {
-            failedTargets = {*rcr.response.target};
-        } else {
-            failedTargets = rcr.request.target;
-        }
+            if (rcr.response.target) {
+                failedTargets = {*rcr.response.target};
+            } else {
+                failedTargets = rcr.request.target;
+            }

-        shard->updateReplSetMonitor(failedTargets.front(), status);
-        bool isStartingTransaction = _cmdObj.getField("startTransaction").booleanSafe();
-        if (!_ars->_stopRetrying && shard->isRetriableError(status.code(), _ars->_retryPolicy) &&
-            _retryCount < kMaxNumFailedHostRetryAttempts && !isStartingTransaction) {
+            shard->updateReplSetMonitor(failedTargets.front(), status);
+            bool isStartingTransaction = _cmdObj.getField("startTransaction").booleanSafe();
+            if (!_ars->_stopRetrying &&
+                shard->isRetriableError(status.code(), _ars->_retryPolicy) &&
+                _retryCount < kMaxNumFailedHostRetryAttempts && !isStartingTransaction) {

-            LOGV2_DEBUG(4615637,
-                        1,
-                        "Command to remote {shardId} for hosts {hosts} failed with retryable error "
-                        "{error} and will be retried",
-                        "Command to remote shard failed with retryable error and will be retried",
-                        "shardId"_attr = _shardId,
-                        "hosts"_attr = failedTargets,
-                        "error"_attr = redact(status));
-            ++_retryCount;
-            _shardHostAndPort.reset();
-            // retry through recursion
-            return scheduleRequest();
-        }
-    }
+                LOGV2_DEBUG(
+                    4615637,
+                    1,
+                    "Command to remote {shardId} for hosts {hosts} failed with retryable error "
+                    "{error} and will be retried",
+                    "Command to remote shard failed with retryable error and will be retried",
+                    "shardId"_attr = _shardId,
+                    "hosts"_attr = failedTargets,
+                    "error"_attr = redact(status));
+                ++_retryCount;
+                _shardHostAndPort.reset();
+                // retry through recursion
+                return scheduleRequest();
+            }

-    // Status' in the response.status field that aren't retried get converted to top level errors
-    uassertStatusOK(rcr.response.status);
+            // Status' in the response.status field that aren't retried get converted to top level
+            // errors
+            uassertStatusOK(rcr.response.status);

-    // We're not okay (on the remote), but still not going to retry
-    return std::move(rcr);
+            // We're not okay (on the remote), but still not going to retry
+            return Future<RemoteCommandOnAnyCallbackArgs>::makeReady(std::move(rcr)).semi();
+        })
+        .semi();
 };

 }  // namespace mongo
--- a/src/mongo/s/async_requests_sender.h
+++ b/src/mongo/s/async_requests_sender.h
@@ -177,9 +177,15 @@ private:
        RemoteData(AsyncRequestsSender* ars, ShardId shardId, BSONObj cmdObj);

        /**
-         * Returns the Shard object associated with this remote.
+         * Returns a SemiFuture containing a shard object associated with this remote.
+         *
+         * This will return a SemiFuture with a ShardNotFound error status in case the shard is not
+         * found.
+         *
+         * Additionally this call can trigger a refresh of the ShardRegistry so it could possibly
+         * return other network error status related to the refresh.
         */
-        std::shared_ptr<Shard> getShard();
+        SemiFuture<std::shared_ptr<Shard>> getShard() noexcept;

        /**
         * Returns true if we've already queued a response from the remote.
--- a/src/mongo/s/client/shard_registry.cpp
+++ b/src/mongo/s/client/shard_registry.cpp
@@ -218,9 +218,9 @@ void ShardRegistry::startupPeriodicReloader(OperationContext* opCtx) {

    AsyncTry([this] {
        LOGV2_DEBUG(22726, 1, "Reloading shardRegistry");
-        return _reloadInternal();
+        return _reloadAsyncNoRetry();
    })
-        .until([](auto sw) {
+        .until([](auto&& sw) {
            if (!sw.isOK()) {
                LOGV2(22727,
                      "Error running periodic reload of shard registry",
@@ -232,7 +232,7 @@ void ShardRegistry::startupPeriodicReloader(OperationContext* opCtx) {
        })
        .withDelayBetweenIterations(kRefreshPeriod)  // This call is optional.
        .on(_executor, CancellationToken::uncancelable())
-        .getAsync([](auto sw) {
+        .getAsync([](auto&& sw) {
            LOGV2_DEBUG(22725,
                        1,
                        "Exiting periodic shard registry reloader",
@@ -295,6 +295,49 @@ StatusWith<std::shared_ptr<Shard>> ShardRegistry::getShard(OperationContext* opC
    return {ErrorCodes::ShardNotFound, str::stream() << "Shard " << shardId << " not found"};
 }

+SemiFuture<std::shared_ptr<Shard>> ShardRegistry::getShard(ExecutorPtr executor,
+                                                           const ShardId& shardId) noexcept {
+
+    // Fetch the shard registry data associated to the latest known topology time
+    return _getDataAsync()
+        .thenRunOn(executor)
+        .then([this, executor, shardId](auto&& cachedData) {
+            // First check if this is a non config shard lookup
+            if (auto shard = cachedData->findShard(shardId)) {
+                return SemiFuture<std::shared_ptr<Shard>>::makeReady(std::move(shard));
+            }
+
+            // then check if this is a config shard (this call is blocking in any case)
+            {
+                stdx::lock_guard<Latch> lk(_mutex);
+                if (auto shard = _configShardData.findShard(shardId)) {
+                    return SemiFuture<std::shared_ptr<Shard>>::makeReady(std::move(shard));
+                }
+            }
+
+            // If the shard was not found, force reload the shard regitry data and try again.
+            //
+            // This is to cover the following scenario:
+            // 1. Primary of the replicaset fetch the list of shards and store it on disk
+            // 2. Primary crash before the latest VectorClock topology time is majority written to
+            //    disk
+            // 3. A new primary with a stale ShardRegistry is elected and read the set of shards
+            //    from disk and calls ShardRegistry::getShard
+
+            return _reloadAsync()
+                .thenRunOn(executor)
+                .then([this, executor, shardId](auto&& cachedData) -> std::shared_ptr<Shard> {
+                    auto shard = cachedData->findShard(shardId);
+                    uassert(ErrorCodes::ShardNotFound,
+                            str::stream() << "Shard " << shardId << " not found",
+                            shard);
+                    return shard;
+                })
+                .semi();
+        })
+        .semi();
+}
+
 std::vector<ShardId> ShardRegistry::getAllShardIds(OperationContext* opCtx) {
    auto shardIds = _getData(opCtx)->getAllShardIds();
    if (shardIds.empty()) {
@@ -400,24 +443,27 @@ void ShardRegistry::toBSON(BSONObjBuilder* result) const {
 }

 void ShardRegistry::reload(OperationContext* opCtx) {
+    _reloadAsync().get(opCtx);
+}
+
+SharedSemiFuture<ShardRegistry::Cache::ValueHandle> ShardRegistry::_reloadAsync() {
    if (MONGO_unlikely(TestingProctor::instance().isEnabled())) {
        // TODO SERVER-62152 investigate hang on reload in unit tests
        // Some unit tests don't support running the reload's AsyncTry on the fixed executor.
-        _reloadInternal().get(opCtx);
+        return _reloadAsyncNoRetry();
    } else {
-        AsyncTry([=]() mutable { return _reloadInternal(); })
+        return AsyncTry([=]() mutable { return _reloadAsyncNoRetry(); })
            .until([](auto sw) mutable {
                return sw.getStatus() != ErrorCodes::ReadConcernMajorityNotAvailableYet;
            })
            .withBackoffBetweenIterations(kExponentialBackoff)
-            .on(Grid::get(opCtx)->getExecutorPool()->getFixedExecutor(),
+            .on(Grid::get(getGlobalServiceContext())->getExecutorPool()->getFixedExecutor(),
                CancellationToken::uncancelable())
-            .semi()
-            .get(opCtx);
+            .share();
    }
 }

-SharedSemiFuture<ShardRegistry::Cache::ValueHandle> ShardRegistry::_reloadInternal() {
+SharedSemiFuture<ShardRegistry::Cache::ValueHandle> ShardRegistry::_reloadAsyncNoRetry() {
    // Make the next acquire do a lookup.
    auto value = _forceReloadIncrement.addAndFetch(1);
    LOGV2_DEBUG(4620253, 2, "Forcing ShardRegistry reload", "newForceReloadIncrement"_attr = value);
--- a/src/mongo/s/client/shard_registry.h
+++ b/src/mongo/s/client/shard_registry.h
@@ -239,6 +239,9 @@ public:
     */
    StatusWith<std::shared_ptr<Shard>> getShard(OperationContext* opCtx, const ShardId& shardId);

+    SemiFuture<std::shared_ptr<Shard>> getShard(ExecutorPtr executor,
+                                                const ShardId& shardId) noexcept;
+
    /**
     * Returns a vector containing all known shard IDs.
     * The order of the elements is not guaranteed.
@@ -438,7 +441,8 @@ private:

    void _initializeCacheIfNecessary() const;

-    SharedSemiFuture<Cache::ValueHandle> _reloadInternal();
+    SharedSemiFuture<Cache::ValueHandle> _reloadAsync();
+    SharedSemiFuture<Cache::ValueHandle> _reloadAsyncNoRetry();

    /**
     * Factory to create shards.  Never changed after startup so safe to access outside of _mutex.
Author	SHA1	Message	Date
David Bradford	08077e4d70	SERVER-67405: Fix is_patch lookup for build_variant_gen (cherry picked from commit `1260ae9d64`)	2022-07-12 13:47:15 +00:00
Tommaso Tocci	4f9e10bb30	SERVER-57519 Make ARS use causally consistent ShardRegistry::getShard() function (cherry picked from commit `527dfe85a1`)	2022-07-11 21:46:35 +00:00
Henrik Edin	894acad323	SERVER-64307 Remove BSONColumn benchmarks from running on Evergreen (cherry picked from commit `3928c9d541`)	2022-06-09 18:41:13 +00:00
Henrik Edin	a86bb832e1	SERVER-65100 Increase block size for BSONColumn to be BSONObjMaxUserSize (cherry picked from commit `0c2e3ff947`)	2022-06-08 19:52:08 +00:00
Max Hirschhorn	3276296336	SERVER-66618 Wait in test for resharding coordinator to persist abort. Fixes an issue in the resharding_coordinator_recovers_abort_decision.js test where the reshardingPauseCoordinatorBeforeBlockingWrites failpoint is being released too early and allowing the resharding coordinator to decide to commit the resharding operation instead. (cherry picked from commit `18ec837622`)	2022-06-07 20:28:11 +00:00
Vlad Rachev	de31372c6f	SERVER-66955 Remove JSON.send usage in sys-perf and performance (cherry picked from commit 421855380204e5a9dcde2b83cccef8e6f75e3a96) (cherry picked from commit 49e8e9786b5a074e3c28793e1ab7c425453cd14f)	2022-06-06 17:12:52 +00:00
Andrew Shuvalov	814159344a	SERVER-66843 Use defensive programming in DeadlineFuture destructor (cherry picked from commit `cae77d5066`)	2022-06-03 17:21:59 +00:00
Tommaso Tocci	58d6f304e9	SERVER-66860 FSM tests should not reuse same database names (cherry picked from commit `d88a892d5b`)	2022-06-01 08:52:30 +00:00
Matt Kneiser	64ae4a253e	SERVER-66769 Add Windows support to spawnhost setup script (cherry picked from commit `ad379f1338`)	2022-05-31 17:34:02 +00:00