diff --git a/buildscripts/evergreen_gen_multiversion_tests.py b/buildscripts/evergreen_gen_multiversion_tests.py index 5105e68ed98..a8a750ef85e 100755 --- a/buildscripts/evergreen_gen_multiversion_tests.py +++ b/buildscripts/evergreen_gen_multiversion_tests.py @@ -13,6 +13,7 @@ import requests import click import structlog +from buildscripts.resmokelib.core.programs import get_path_env_var from buildscripts.resmokelib.multiversionconstants import ( LAST_LTS_MONGO_BINARY, LAST_CONTINUOUS_MONGO_BINARY, REQUIRES_FCV_TAG) from buildscripts.util.cmdutils import enable_logging @@ -38,10 +39,15 @@ BACKPORTS_REQUIRED_BASE_URL = "https://raw.githubusercontent.com/mongodb/mongo" def get_backports_required_hash_for_shell_version(mongo_shell_path=None): """Parse the last-lts shell binary to get the commit hash.""" + env_vars = {} + path = get_path_env_var(env_vars=env_vars) + env_vars["PATH"] = os.pathsep.join(path) + if platform.startswith("win"): - shell_version = check_output([mongo_shell_path + ".exe", "--version"]).decode('utf-8') + shell_version = check_output([mongo_shell_path + ".exe", "--version"], + env=env_vars).decode('utf-8') else: - shell_version = check_output([mongo_shell_path, "--version"]).decode('utf-8') + shell_version = check_output([mongo_shell_path, "--version"], env=env_vars).decode('utf-8') for line in shell_version.splitlines(): if "gitVersion" in line: version_line = line.split(':')[1] diff --git a/buildscripts/resmokeconfig/suites/retryable_writes_downgrade_passthrough.yml b/buildscripts/resmokeconfig/suites/retryable_writes_downgrade_passthrough.yml new file mode 100644 index 00000000000..5e870164665 --- /dev/null +++ b/buildscripts/resmokeconfig/suites/retryable_writes_downgrade_passthrough.yml @@ -0,0 +1,181 @@ +test_kind: js_test +use_in_multiversion: retryable_writes + +selector: + roots: + - jstests/core/**/*.js + exclude_files: + # Transactions do not support retryability of individual operations. 
+ # TODO: Remove this once it is supported (SERVER-33952). + - jstests/core/txns/**/*.js + + # No-op retries are not ignored by top, the profiler, or opcount. + - jstests/core/operation_latency_histogram.js + - jstests/core/profile2.js + - jstests/core/profile3.js + - jstests/core/profile_findandmodify.js + - jstests/core/top.js + - jstests/core/views/views_stats.js + + # TODO SERVER-31242: findAndModify no-op retry should respect the fields option. + - jstests/core/crud_api.js + - jstests/core/find_and_modify.js + - jstests/core/find_and_modify2.js + - jstests/core/find_and_modify_server6865.js + + # Stepdown commands during fsync lock will fail. + - jstests/core/currentop.js + - jstests/core/fsync.js + - jstests/core/killop_drop_collection.js + + # Expect drops/creates to fail or have a certain response: + - jstests/core/explain_upsert.js + - jstests/core/indexes_multiple_commands.js + + # Expect certain responses, but retries of successfully completed commands may return + # different values: + - jstests/core/create_indexes.js + - jstests/core/objid5.js + + # Expect results to return in a certain order, secondaries may apply ops out of order. + - jstests/core/coveredIndex1.js + - jstests/core/sortc.js + + - jstests/core/bench_test*.js # benchRun() used for writes + - jstests/core/benchrun_pipeline_updates.js # benchRun() used for writes + - jstests/core/connection_string_validation.js # Does not expect a replica set connection string. + - jstests/core/explain_large_bounds.js # Stepdown can timeout waiting for global lock. + - jstests/core/insert2.js # Creates new mongo connection. + - jstests/core/list_collections_filter.js # Temporary collections are dropped on failover. + - jstests/core/startup_log.js # Checks pid, which is different on each server. + + # Creates new mongo connection but won't retry connecting. + - jstests/core/shell_connection_strings.js + + # Inserts enough data that recovery takes more than 8 seconds, so we never get a working primary. 
+ - jstests/core/geo_s2ordering.js + + exclude_with_any_tags: + - assumes_standalone_mongod + ## + # The next four tags correspond to the special errors thrown by the auto_retry_on_network_error.js + # override when it refuses to run a certain command. Above each tag are the message(s) that cause + # the tag to be warranted. + ## + # "Refusing to run a test that issues a getMore command since if a network error occurs during + # it then we won't know whether the cursor was advanced or not" + - requires_getmore + # "Refusing to run a test that issues non-retryable write operations since the test likely makes + # assertions on the write results and can lead to spurious failures if a network error occurs" + - requires_non_retryable_writes + # "Refusing to run a test that issues commands that are not blindly retryable" + # "Refusing to run a test that issues an aggregation command with $out because it is not + # retryable" + - requires_non_retryable_commands + # "Refusing to run a test that issues commands that may return different values after a failover" + # "Refusing to run a test that issues an aggregation command with explain because it may return + # incomplete results" + # "Refusing to run a test that issues an aggregation command with + # $listLocalSessions because it relies on in-memory state that may not survive failovers" + # "Refusing to run a test that issues a mapReduce command, because it calls std::terminate() if + # interrupted by a stepdown" + - does_not_support_stepdowns + ## + # The next two tags correspond to the special errors thrown by the + # set_read_and_write_concerns.js override when it refuses to replace the readConcern or + # writeConcern of a particular command. Above each tag are the message(s) that cause the tag to be + # warranted. + ## + # "Cowardly refusing to override read concern of command: ..." + - assumes_read_concern_unchanged + # "Cowardly refusing to override write concern of command: ..." 
+ - assumes_write_concern_unchanged + ## + # The next four tags correspond to the special errors thrown by the + # fail_unclean_shutdown_incompatible_commands.js override when it refuses to run commands that are + # inaccurate after an unclean shutdown. Above each tag is the message that causes the tag to be + # warranted. + ## + # "Cowardly fail if fastcount is run with a mongod that had an unclean shutdown: ..." + - requires_fastcount + # "Cowardly fail if dbStats is run with a mongod that had an unclean shutdown: ..." + - requires_dbstats + # "Cowardly fail if collStats is run with a mongod that had an unclean shutdown: ..." + - requires_collstats + # "Cowardly fail if unbounded dataSize is run with a mongod that had an unclean shutdown: ..." + - requires_datasize + ## The next tag corresponds to long-running operations, as they may exhaust their number + # of retries and result in a network error being thrown. + - operations_longer_than_stepdown_interval + # Operations in the main test shell aren't guaranteed to be causally consistent with operations + # performed earlier in a parallel shell if multiple nodes are electable because the latest + # operation and cluster times aren't shared between shells. 
+ # "Cowardly refusing to run test with network retries enabled when it uses startParallelShell()" + - uses_parallel_shell + +executor: + archive: + tests: true + hooks: + - CheckReplDBHash + - CheckReplOplogs + - ValidateCollections + config: + shell_options: + eval: >- + testingReplication = true; + load('jstests/libs/override_methods/network_error_and_txn_override.js'); + db = connect(TestData.connectionString); + load("jstests/libs/override_methods/enable_sessions.js"); + load("jstests/libs/override_methods/set_read_and_write_concerns.js"); + load("jstests/libs/override_methods/fail_unclean_shutdown_incompatible_commands.js"); + load("jstests/libs/override_methods/fail_unclean_shutdown_start_parallel_shell.js"); + global_vars: + TestData: + alwaysInjectTransactionNumber: true + defaultReadConcernLevel: "majority" + logRetryAttempts: true + networkErrorAndTxnOverrideConfig: + retryOnNetworkErrors: true + overrideRetryAttempts: 3 + sessionOptions: + readConcern: + level: "majority" + # Force DBClientRS to find the primary for non-write commands. + readPreference: + mode: "primary" + retryWrites: true + # We specify nodb so the shell used by each test will attempt to connect after loading the + # retry logic in auto_retry_on_network_error.js. + nodb: "" + hooks: + - class: ContinuousStepdown + should_downgrade: true + # The CheckReplDBHash hook waits until all operations have replicated to and have been applied + # on the secondaries, so we run the ValidateCollections hook after it to ensure we're + # validating the entire contents of the collection. 
+ - class: CheckReplOplogs + - class: CheckReplDBHash + - class: ValidateCollections + - class: CleanEveryN + n: 20 + fixture: + class: ReplicaSetFixture + mongod_options: + enableMajorityReadConcern: '' + syncdelay: 5 + wiredTigerEngineConfigString: "debug_mode=(table_logging=true)" + set_parameters: + enableTestCommands: 1 + enableElectionHandoff: 0 + logComponentVerbosity: + replication: + heartbeats: 2 + rollback: 2 + storage: + recovery: 2 + all_nodes_electable: true + num_nodes: 3 + replset_config_options: + settings: + catchUpTimeoutMillis: 0 diff --git a/buildscripts/resmokelib/core/programs.py b/buildscripts/resmokelib/core/programs.py index 2a9b3715957..a2d6b3fd5fd 100644 --- a/buildscripts/resmokelib/core/programs.py +++ b/buildscripts/resmokelib/core/programs.py @@ -30,23 +30,29 @@ def make_process(*args, **kwargs): # Add the current working directory and /data/multiversion to the PATH. env_vars = kwargs.get("env_vars", {}).copy() - path = [ - os.getcwd(), - config.DEFAULT_MULTIVERSION_DIR, - ] + path = get_path_env_var(env_vars) - # If installDir is provided, add it early to the path if config.INSTALL_DIR is not None: - path.append(config.INSTALL_DIR) env_vars["INSTALL_DIR"] = config.INSTALL_DIR - path.append(env_vars.get("PATH", os.environ.get("PATH", ""))) - env_vars["PATH"] = os.pathsep.join(path) kwargs["env_vars"] = env_vars return process_cls(*args, **kwargs) +def get_path_env_var(env_vars): + """Return the path base on provided environment variable.""" + path = [ + os.getcwd(), + config.DEFAULT_MULTIVERSION_DIR, + ] + # If installDir is provided, add it early to the path + if config.INSTALL_DIR is not None: + path.append(config.INSTALL_DIR) + path.append(env_vars.get("PATH", os.environ.get("PATH", ""))) + return path + + def mongod_program(logger, job_num, executable, process_kwargs, mongod_options): """ Return a Process instance that starts mongod arguments constructed from 'mongod_options'. 
diff --git a/buildscripts/resmokelib/testing/fixtures/_builder.py b/buildscripts/resmokelib/testing/fixtures/_builder.py index 8c74ddfd841..12e056e76cc 100644 --- a/buildscripts/resmokelib/testing/fixtures/_builder.py +++ b/buildscripts/resmokelib/testing/fixtures/_builder.py @@ -52,10 +52,19 @@ class FixtureBuilder(ABC, metaclass=registry.make_registry_metaclass(_BUILDERS, return +class BinVersionEnum(object): + """Enumeration version types.""" + + OLD = 'old' + NEW = 'new' + + class ReplSetBuilder(FixtureBuilder): """Builder class for fixtures support replication.""" REGISTERED_NAME = "ReplicaSetFixture" + latest_class = "MongoDFixture" + multiversion_class_suffix = "_multiversion_class_suffix" def build_fixture(self, logger, job_num, fixturelib, *args, **kwargs): # pylint: disable=too-many-locals """Build a replica set.""" @@ -76,48 +85,47 @@ class ReplSetBuilder(FixtureBuilder): kwargs["mongod_executable"] = mongod_executable num_nodes = kwargs["num_nodes"] latest_mongod = mongod_executable - latest_class = "MongoDFixture" - executables = [] - classes = [] - fcv = None - multiversion_class_suffix = "_" + old_bin_version - shell_version = { - config.MultiversionOptions.LAST_LTS: - multiversionconstants.LAST_LTS_MONGO_BINARY, - config.MultiversionOptions.LAST_CONTINUOUS: - multiversionconstants.LAST_CONTINUOUS_MONGO_BINARY - }[old_bin_version] + fcv = multiversionconstants.LATEST_FCV - mongod_version = { - config.MultiversionOptions.LAST_LTS: - multiversionconstants.LAST_LTS_MONGOD_BINARY, - config.MultiversionOptions.LAST_CONTINUOUS: - multiversionconstants.LAST_CONTINUOUS_MONGOD_BINARY - }[old_bin_version] + executables = {BinVersionEnum.NEW: latest_mongod} + classes = {BinVersionEnum.NEW: self.latest_class} - if mixed_bin_versions is None: - executables = [latest_mongod for x in range(num_nodes)] - classes = [latest_class for x in range(num_nodes)] - else: + # Default to NEW for all bin versions; may be overridden below. 
+ mongod_binary_versions = [BinVersionEnum.NEW for _ in range(num_nodes)] + + is_multiversion = mixed_bin_versions is not None + if is_multiversion: + old_shell_version = { + config.MultiversionOptions.LAST_LTS: + multiversionconstants.LAST_LTS_MONGO_BINARY, + config.MultiversionOptions.LAST_CONTINUOUS: + multiversionconstants.LAST_CONTINUOUS_MONGO_BINARY + }[old_bin_version] + + old_mongod_version = { + config.MultiversionOptions.LAST_LTS: + multiversionconstants.LAST_LTS_MONGOD_BINARY, + config.MultiversionOptions.LAST_CONTINUOUS: + multiversionconstants.LAST_CONTINUOUS_MONGOD_BINARY + }[old_bin_version] + + executables[BinVersionEnum.OLD] = old_mongod_version + classes[BinVersionEnum.OLD] = f"{self.latest_class}{self.multiversion_class_suffix}" + + load_version(version_path_suffix=self.multiversion_class_suffix, + shell_path=old_shell_version) is_config_svr = "configsvr" in replset_config_options and replset_config_options[ "configsvr"] - load_version(version_path_suffix=multiversion_class_suffix, shell_path=shell_version) if not is_config_svr: - executables = [ - latest_mongod if (x == "new") else mongod_version for x in mixed_bin_versions - ] - classes = [ - latest_class if (x == "new") else f"{latest_class}{multiversion_class_suffix}" - for x in mixed_bin_versions - ] - if is_config_svr: + mongod_binary_versions = [x for x in mixed_bin_versions] + else: + # Our documented recommended path for upgrading shards lets us assume that config # server nodes will always be fully upgraded before shard nodes. 
- executables = [latest_mongod, latest_mongod] - classes = [latest_class, latest_class] + mongod_binary_versions = [BinVersionEnum.NEW] * 2 num_versions = len(mixed_bin_versions) fcv = { @@ -134,8 +142,9 @@ class ReplSetBuilder(FixtureBuilder): replset = _FIXTURES[self.REGISTERED_NAME](logger, job_num, fixturelib, *args, **kwargs) replset.set_fcv(fcv) - for i in range(replset.num_nodes): - node = self._new_mongod(replset, i, executables[i], classes[i]) + for node_index in range(replset.num_nodes): + node = self._new_mongod(replset, node_index, executables, classes, + mongod_binary_versions[node_index], is_multiversion) replset.install_mongod(node) if replset.start_initial_sync_node: @@ -143,19 +152,39 @@ class ReplSetBuilder(FixtureBuilder): replset.initial_sync_node_idx = replset.num_nodes # TODO: This adds the linear chain and steady state param now, is that ok? replset.initial_sync_node = self._new_mongod(replset, replset.initial_sync_node_idx, - latest_mongod, latest_class) + executables, classes, + BinVersionEnum.NEW, is_multiversion) return replset @classmethod - def _new_mongod(cls, replset, index, executable, mongod_class): + def _new_mongod(cls, replset, replset_node_index, executables, classes, cur_version, + is_multiversion): + # pylint: disable=too-many-arguments """Return a standalone.MongoDFixture configured to be used as replica-set member.""" - mongod_logger = replset.get_logger_for_mongod(index) - mongod_options = replset.get_options_for_mongod(index) + mongod_logger = replset.get_logger_for_mongod(replset_node_index) + mongod_options = replset.get_options_for_mongod(replset_node_index) - return make_fixture(mongod_class, mongod_logger, replset.job_num, - mongod_executable=executable, mongod_options=mongod_options, - preserve_dbpath=replset.preserve_dbpath) + new_fixture_port = None + old_fixture = None + + # There is more than one class for mongod, this means we're in multiversion mode. 
+ if is_multiversion: + old_fixture = make_fixture(classes[BinVersionEnum.OLD], mongod_logger, replset.job_num, + mongod_executable=executables[BinVersionEnum.OLD], + mongod_options=mongod_options, + preserve_dbpath=replset.preserve_dbpath) + + # Assign the same port for old and new fixtures so upgrade/downgrade can be done without + # changing the replicaset config. + new_fixture_port = old_fixture.port + + new_fixture = make_fixture(classes[BinVersionEnum.NEW], mongod_logger, replset.job_num, + mongod_executable=executables[BinVersionEnum.NEW], + mongod_options=mongod_options, + preserve_dbpath=replset.preserve_dbpath, port=new_fixture_port) + + return FixtureContainer(new_fixture, old_fixture, cur_version) def load_version(version_path_suffix=None, shell_path=None): @@ -165,7 +194,7 @@ def load_version(version_path_suffix=None, shell_path=None): retrieve_dir = os.path.relpath(os.path.join(RETRIEVE_DIR, version_path_suffix)) if not os.path.exists(retrieve_dir): try: - # Avoud circular import + # Avoid circular import import buildscripts.evergreen_gen_multiversion_tests as gen_tests commit = gen_tests.get_backports_required_hash_for_shell_version( mongo_shell_path=shell_path) @@ -190,3 +219,48 @@ def retrieve_fixtures(directory, commit): output = os.path.join(directory, blob.name) with io.BytesIO(blob.data_stream.read()) as retrieved, open(output, "w") as file: file.write(retrieved.read().decode("utf-8")) + + +class FixtureContainer(object): + """Provide automatic state change between old and new fixture.""" + + attributes = ["_fixtures", "cur_version_cls", "get_cur_version"] + + def __init__(self, new_fixture, old_fixture=None, cur_version=None): + """Initialize FixtureContainer.""" + + if old_fixture is not None: + self._fixtures = {BinVersionEnum.NEW: new_fixture, BinVersionEnum.OLD: old_fixture} + self.cur_version_cls = self._fixtures[cur_version] + else: + # No need to support dictionary of fixture classes if only a single version of + # fixtures is used. 
+ self._fixtures = None + self.cur_version_cls = new_fixture + + def change_version_if_needed(self, node): + """ + Upgrade or downgrade the fixture version to be different to that of `node`. + + @returns a boolean of whether the version was changed. + """ + if self.cur_version_cls == node.get_cur_version(): + for ver, cls in self._fixtures.items(): + if ver != node.get_cur_version(): + self.cur_version_cls = cls + return True + else: + return False + + def get_cur_version(self): + """Get current fixture version from FixtureContainer.""" + return self.cur_version_cls + + def __getattr__(self, name): + return self.cur_version_cls.__getattribute__(name) + + def __setattr__(self, key, value): + if key in FixtureContainer.attributes: + return object.__setattr__(self, key, value) + else: + return self.cur_version_cls.__setattr__(key, value) diff --git a/buildscripts/resmokelib/testing/fixtures/replicaset.py b/buildscripts/resmokelib/testing/fixtures/replicaset.py index 1251c0607a0..6cbcca1d403 100644 --- a/buildscripts/resmokelib/testing/fixtures/replicaset.py +++ b/buildscripts/resmokelib/testing/fixtures/replicaset.py @@ -1,6 +1,7 @@ """Replica set fixture for executing JSTests against.""" import os.path +import random import time import bson @@ -10,8 +11,35 @@ import pymongo.write_concern import buildscripts.resmokelib.testing.fixtures.interface as interface +USE_LEGACY_MULTIVERSION = False -class ReplicaSetFixture(interface.ReplFixture): # pylint: disable=too-many-instance-attributes + +def compare_timestamp(timestamp1, timestamp2): + """Compare the timestamp object ts part.""" + if timestamp1.time == timestamp2.time: + if timestamp1.inc < timestamp2.inc: + return -1 + elif timestamp1.inc > timestamp2.inc: + return 1 + else: + return 0 + elif timestamp1.time < timestamp2.time: + return -1 + else: + return 1 + + +def compare_optime(optime1, optime2): + """Compare timestamp object t part.""" + if optime1["t"] > optime2["t"]: + return 1 + elif optime1["t"] < 
optime2["t"]: + return -1 + else: + return compare_timestamp(optime1["ts"], optime2["ts"]) + + +class ReplicaSetFixture(interface.ReplFixture): # pylint: disable=too-many-instance-attributes, too-many-public-methods """Fixture which provides JSTests with a replica set to run against.""" # Error response codes copied from mongo/base/error_codes.yml. @@ -513,11 +541,164 @@ class ReplicaSetFixture(interface.ReplFixture): # pylint: disable=too-many-inst # of isMaster requests. continue + def stop_primary(self, primary, background_reconfig, kill): + """Stop the primary node method.""" + # Check that the fixture is still running before stepping down or killing the primary. + # This ensures we still detect some cases in which the fixture has already crashed. + if not self.is_running(): + raise pymongo.errors.ServerFailure("ReplicaSetFixture {} expected to be running in" + " ContinuousStepdown, but wasn't.".format( + self.replset_name)) + + # If we're running with background reconfigs, it's possible to be in a scenario + # where we kill a necessary voting node (i.e. in a 5 node repl set), only 2 are + # voting. In this scenario, we want to avoid killing the primary because no + # secondary can step up. + if background_reconfig: + # stagger the kill thread so that it runs a little after the reconfig thread + time.sleep(1) + voting_members = self.get_voting_members() + + self.logger.info("Current voting members: %s", voting_members) + + if len(voting_members) <= 3: + # Do not kill or terminate the primary if we don't have enough voting nodes to + # elect a new primary. 
+ return False + + should_kill = kill and random.choice([True, False]) + action = "Killing" if should_kill else "Terminating" + self.logger.info("%s the primary on port %d of replica set '%s'.", action, primary.port, + self.replset_name) + + # We send the mongod process the signal to exit but don't immediately wait for it to + # exit because clean shutdown may take a while and we want to restore write availability + # as quickly as possible. + teardown_mode = interface.TeardownMode.KILL if should_kill else interface.TeardownMode.TERMINATE + primary.mongod.stop(mode=teardown_mode) + return True + + def change_version_and_restart_node(self, primary, auth_options): + """ + Select Secondary for stepUp. + + Ensure its version is different to that + of the old primary; change the version of the Secondary is needed. + """ + + def get_chosen_node_from_replsetstatus(status_member_infos): + max_optime = None + chosen_index = None + # We always select the secondary with highest optime to setup. + for member_info in status_member_infos: + if member_info.get("self", False): + # Ignore self, which is the old primary and not eligible + # to be re-elected in downgrade multiversion cluster. 
+ continue + optime_dict = member_info["optime"] + if max_optime is None: + chosen_index = member_info["_id"] + max_optime = optime_dict + else: + if compare_optime(optime_dict, max_optime) > 0: + chosen_index = member_info["_id"] + max_optime = optime_dict + + if chosen_index is None or max_optime is None: + raise self.fixturelib.ServerFailure( + "Failed to find a secondary eligible for " + f"election; index: {chosen_index}, optime: {max_optime}") + + return self.nodes[chosen_index] + + primary_client = interface.authenticate(primary.mongo_client(), auth_options) + retry_time_secs = self.AWAIT_REPL_TIMEOUT_MINS * 60 + retry_start_time = time.time() + + while True: + member_infos = primary_client.admin.command({"replSetGetStatus": 1})["members"] + chosen_node = get_chosen_node_from_replsetstatus(member_infos) + + if chosen_node.change_version_if_needed(primary): + self.logger.info( + "Waiting for the chosen secondary on port %d of replica set '%s' to exit.", + chosen_node.port, self.replset_name) + + teardown_mode = interface.TeardownMode.TERMINATE + chosen_node.mongod.stop(mode=teardown_mode) + chosen_node.mongod.wait() + + self.logger.info( + "Attempting to restart the chosen secondary on port %d of replica set '%s.", + chosen_node.port, self.replset_name) + + chosen_node.setup() + self.logger.info(interface.create_fixture_table(self)) + chosen_node.await_ready() + + if self.stepup_node(chosen_node, auth_options): + break + + if time.time() - retry_start_time > retry_time_secs: + raise pymongo.errors.ServerFailure( + "The old primary on port {} of replica set {} did not step up in" + " {} seconds.".format(chosen_node.port, self.replset_name, retry_time_secs)) + + return chosen_node + + def stepup_node(self, node, auth_options): + """Try to step up the given node; return whether the attempt was successful.""" + try: + self.logger.info( + "Attempting to step up the chosen secondary on port %d of replica set '%s.", + node.port, self.replset_name) + client = 
interface.authenticate(node.mongo_client(), auth_options) + client.admin.command("replSetStepUp") + return True + except pymongo.errors.OperationFailure: + # OperationFailure exceptions are expected when the election attempt fails due to + # not receiving enough votes. This can happen when the 'chosen' secondary's opTime + # is behind that of other secondaries. We handle this by attempting to elect a + # different secondary. + self.logger.info("Failed to step up the secondary on port %d of replica set '%s'.", + node.port, self.replset_name) + return False + except pymongo.errors.AutoReconnect: + # It is possible for a replSetStepUp to fail with AutoReconnect if that node goes + # into Rollback (which causes it to close any open connections). + return False + + def restart_node(self, chosen): + """Restart the new step up node.""" + self.logger.info("Waiting for the old primary on port %d of replica set '%s' to exit.", + chosen.port, self.replset_name) + + chosen.mongod.wait() + + self.logger.info("Attempting to restart the old primary on port %d of replica set '%s.", + chosen.port, self.replset_name) + + # Restart the mongod on the old primary and wait until we can contact it again. Keep the + # original preserve_dbpath to restore after restarting the mongod. 
+ original_preserve_dbpath = chosen.preserve_dbpath + chosen.preserve_dbpath = True + try: + chosen.setup() + self.logger.info(interface.create_fixture_table(self)) + chosen.await_ready() + finally: + chosen.preserve_dbpath = original_preserve_dbpath + def get_secondaries(self): """Return a list of secondaries from the replica set.""" primary = self.get_primary() return [node for node in self.nodes if node.port != primary.port] + def get_secondary_indices(self): + """Return a list of secondary indices from the replica set.""" + primary = self.get_primary() + return [index for index, node in enumerate(self.nodes) if node.port != primary.port] + def get_voting_members(self): """Return the number of voting nodes in the replica set.""" primary = self.get_primary() diff --git a/buildscripts/resmokelib/testing/fixtures/standalone.py b/buildscripts/resmokelib/testing/fixtures/standalone.py index 2f1923a150a..19925edb6be 100644 --- a/buildscripts/resmokelib/testing/fixtures/standalone.py +++ b/buildscripts/resmokelib/testing/fixtures/standalone.py @@ -17,7 +17,7 @@ class MongoDFixture(interface.Fixture): def __init__( # pylint: disable=too-many-arguments self, logger, job_num, fixturelib, mongod_executable=None, mongod_options=None, - dbpath_prefix=None, preserve_dbpath=False): + dbpath_prefix=None, preserve_dbpath=False, port=None): """Initialize MongoDFixture with different options for the mongod process.""" interface.Fixture.__init__(self, logger, job_num, fixturelib, dbpath_prefix=dbpath_prefix) self.mongod_options = self.fixturelib.make_historic( @@ -48,7 +48,7 @@ class MongoDFixture(interface.Fixture): self.preserve_dbpath = preserve_dbpath self.mongod = None - self.port = fixturelib.get_next_port(job_num) + self.port = port or fixturelib.get_next_port(job_num) self.mongod_options["port"] = self.port def setup(self): diff --git a/buildscripts/resmokelib/testing/hook_test_archival.py b/buildscripts/resmokelib/testing/hook_test_archival.py index 
d8944290394..38909056ce3 100644 --- a/buildscripts/resmokelib/testing/hook_test_archival.py +++ b/buildscripts/resmokelib/testing/hook_test_archival.py @@ -17,8 +17,6 @@ class HookTestArchival(object): self.archive_instance = archive_instance archive_config = utils.default_if_none(archive_config, {}) - self.on_success = archive_config.get("on_success", False) - self.tests = [] self.archive_all = False if "tests" in archive_config: @@ -50,8 +48,7 @@ class HookTestArchival(object): """ success = result.success - should_archive = (config.ARCHIVE_FILE and self.archive_instance - and (not success or self.on_success)) + should_archive = (config.ARCHIVE_FILE and self.archive_instance) and not success if not should_archive: return diff --git a/buildscripts/resmokelib/testing/hooks/aggregate_metrics_background.py b/buildscripts/resmokelib/testing/hooks/aggregate_metrics_background.py index 0b1293bb5cb..af3d9109083 100644 --- a/buildscripts/resmokelib/testing/hooks/aggregate_metrics_background.py +++ b/buildscripts/resmokelib/testing/hooks/aggregate_metrics_background.py @@ -14,6 +14,8 @@ from buildscripts.resmokelib.testing.hooks.background_job import _BackgroundJob, class AggregateResourceConsumptionMetricsInBackground(jsfile.JSHook): """A hook to run $operationMetrics stage in the background.""" + IS_BACKGROUND = True + def __init__(self, hook_logger, fixture, shell_options=None): """Initialize AggregateResourceConsumptionMetricsInBackground.""" description = "Run background $operationMetrics on all mongods while a test is running" diff --git a/buildscripts/resmokelib/testing/hooks/cleanup.py b/buildscripts/resmokelib/testing/hooks/cleanup.py index 50011fe5201..2b0f3cc4dfc 100644 --- a/buildscripts/resmokelib/testing/hooks/cleanup.py +++ b/buildscripts/resmokelib/testing/hooks/cleanup.py @@ -11,6 +11,8 @@ class CleanEveryN(interface.Hook): On mongod-related fixtures, this will clear the dbpath. 
""" + IS_BACKGROUND = False + DEFAULT_N = 20 def __init__(self, hook_logger, fixture, n=DEFAULT_N): diff --git a/buildscripts/resmokelib/testing/hooks/cleanup_concurrency_workloads.py b/buildscripts/resmokelib/testing/hooks/cleanup_concurrency_workloads.py index d04f907d556..3f7f715a319 100644 --- a/buildscripts/resmokelib/testing/hooks/cleanup_concurrency_workloads.py +++ b/buildscripts/resmokelib/testing/hooks/cleanup_concurrency_workloads.py @@ -18,6 +18,8 @@ class CleanupConcurrencyWorkloads(interface.Hook): except for 'exclude_dbs' and the collection used by the test/workloads. """ + IS_BACKGROUND = False + def __init__( #pylint: disable=too-many-arguments self, hook_logger, fixture, exclude_dbs=None, same_collection=False, same_db=False): """Initialize CleanupConcurrencyWorkloads.""" diff --git a/buildscripts/resmokelib/testing/hooks/combine_benchmark_results.py b/buildscripts/resmokelib/testing/hooks/combine_benchmark_results.py index db637793d5c..3c329461307 100644 --- a/buildscripts/resmokelib/testing/hooks/combine_benchmark_results.py +++ b/buildscripts/resmokelib/testing/hooks/combine_benchmark_results.py @@ -19,6 +19,8 @@ class CombineBenchmarkResults(interface.Hook): DESCRIPTION = "Combine JSON results from individual benchmarks" + IS_BACKGROUND = False + def __init__(self, hook_logger, fixture): """Initialize CombineBenchmarkResults.""" interface.Hook.__init__(self, hook_logger, fixture, CombineBenchmarkResults.DESCRIPTION) diff --git a/buildscripts/resmokelib/testing/hooks/cpp_libfuzzer.py b/buildscripts/resmokelib/testing/hooks/cpp_libfuzzer.py index 4e59bf97d73..c0b6087544a 100644 --- a/buildscripts/resmokelib/testing/hooks/cpp_libfuzzer.py +++ b/buildscripts/resmokelib/testing/hooks/cpp_libfuzzer.py @@ -12,7 +12,9 @@ from buildscripts.resmokelib.testing.hooks import interface class LibfuzzerHook(interface.Hook): # pylint: disable=too-many-instance-attributes """Merges inputs after a fuzzer run.""" - DESCRIPTION = ("Merges inputs after a fuzzer 
run") + DESCRIPTION = "Merges inputs after a fuzzer run" + + IS_BACKGROUND = False def __init__(self, hook_logger, fixture): """Initialize the ContinuousStepdown. diff --git a/buildscripts/resmokelib/testing/hooks/dbhash.py b/buildscripts/resmokelib/testing/hooks/dbhash.py index 84e2de2c8f9..c3e20d5f141 100644 --- a/buildscripts/resmokelib/testing/hooks/dbhash.py +++ b/buildscripts/resmokelib/testing/hooks/dbhash.py @@ -12,6 +12,8 @@ class CheckReplDBHash(jsfile.DataConsistencyHook): match on the primary and secondaries. """ + IS_BACKGROUND = False + def __init__( # pylint: disable=super-init-not-called self, hook_logger, fixture, shell_options=None): """Initialize CheckReplDBHash.""" diff --git a/buildscripts/resmokelib/testing/hooks/dbhash_background.py b/buildscripts/resmokelib/testing/hooks/dbhash_background.py index f77d09d515e..b277e939abc 100644 --- a/buildscripts/resmokelib/testing/hooks/dbhash_background.py +++ b/buildscripts/resmokelib/testing/hooks/dbhash_background.py @@ -14,6 +14,8 @@ from buildscripts.resmokelib.testing.hooks.background_job import _BackgroundJob, class CheckReplDBHashInBackground(jsfile.JSHook): """A hook for comparing the dbhashes of all replica set members while a test is running.""" + IS_BACKGROUND = True + def __init__(self, hook_logger, fixture, shell_options=None): """Initialize CheckReplDBHashInBackground.""" description = "Check dbhashes of all replica set members while a test is running" diff --git a/buildscripts/resmokelib/testing/hooks/dbhash_tenant_migration.py b/buildscripts/resmokelib/testing/hooks/dbhash_tenant_migration.py index 9c4f24b1a34..f5582e6482c 100644 --- a/buildscripts/resmokelib/testing/hooks/dbhash_tenant_migration.py +++ b/buildscripts/resmokelib/testing/hooks/dbhash_tenant_migration.py @@ -12,6 +12,8 @@ class CheckTenantMigrationDBHash(jsfile.DataConsistencyHook): match on the primaries of the donor and recipient """ + IS_BACKGROUND = False + def __init__( # pylint: disable=super-init-not-called self, 
hook_logger, fixture, shell_options=None): """Initialize CheckTenantMigrationDBHash.""" diff --git a/buildscripts/resmokelib/testing/hooks/drop_sharded_collections.py b/buildscripts/resmokelib/testing/hooks/drop_sharded_collections.py index 73dbe869f5e..6cc7d66e3dc 100644 --- a/buildscripts/resmokelib/testing/hooks/drop_sharded_collections.py +++ b/buildscripts/resmokelib/testing/hooks/drop_sharded_collections.py @@ -11,6 +11,8 @@ class DropShardedCollections(jsfile.JSHook): With the exception of internal collections like config.system.sessions. """ + IS_BACKGROUND = False + def __init__( # pylint: disable=super-init-not-called self, hook_logger, fixture, shell_options=None): """.""" diff --git a/buildscripts/resmokelib/testing/hooks/enable_spurious_write_conflicts.py b/buildscripts/resmokelib/testing/hooks/enable_spurious_write_conflicts.py index 9ae552c0adf..913d67f8580 100644 --- a/buildscripts/resmokelib/testing/hooks/enable_spurious_write_conflicts.py +++ b/buildscripts/resmokelib/testing/hooks/enable_spurious_write_conflicts.py @@ -13,6 +13,8 @@ from buildscripts.resmokelib.testing.hooks import jsfile class EnableSpuriousWriteConflicts(interface.Hook): """Toggles write conflicts.""" + IS_BACKGROUND = False + def __init__( # pylint: disable=too-many-arguments self, hook_logger, fixture, shell_options=None): """Initialize ToggleWriteConflicts.""" diff --git a/buildscripts/resmokelib/testing/hooks/fuzzer_restore_settings.py b/buildscripts/resmokelib/testing/hooks/fuzzer_restore_settings.py index c559819accd..00277b12f58 100644 --- a/buildscripts/resmokelib/testing/hooks/fuzzer_restore_settings.py +++ b/buildscripts/resmokelib/testing/hooks/fuzzer_restore_settings.py @@ -12,6 +12,8 @@ from buildscripts.resmokelib.testing.hooks import jsfile class FuzzerRestoreSettings(jsfile.JSHook): """Cleans up unwanted changes from fuzzer.""" + IS_BACKGROUND = False + def __init__(self, hook_logger, fixture, shell_options=None): """Run fuzzer cleanup.""" description = "Clean 
up unwanted changes from fuzzer" diff --git a/buildscripts/resmokelib/testing/hooks/hello_failures.py b/buildscripts/resmokelib/testing/hooks/hello_failures.py index a4acdd63a6c..a1e4a94734b 100644 --- a/buildscripts/resmokelib/testing/hooks/hello_failures.py +++ b/buildscripts/resmokelib/testing/hooks/hello_failures.py @@ -17,6 +17,8 @@ from . import jsfile class HelloDelays(interface.Hook): """Sets Hello fault injections.""" + IS_BACKGROUND = False + def __init__(self, hook_logger, fixture): """Initialize HelloDelays.""" description = "Sets Hello fault injections" diff --git a/buildscripts/resmokelib/testing/hooks/initialsync.py b/buildscripts/resmokelib/testing/hooks/initialsync.py index 0061e5b0c44..80b0f0e1d0d 100644 --- a/buildscripts/resmokelib/testing/hooks/initialsync.py +++ b/buildscripts/resmokelib/testing/hooks/initialsync.py @@ -31,6 +31,8 @@ class BackgroundInitialSync(interface.Hook): DEFAULT_N = cleanup.CleanEveryN.DEFAULT_N + IS_BACKGROUND = True + def __init__(self, hook_logger, fixture, n=DEFAULT_N, shell_options=None): """Initialize BackgroundInitialSync.""" if not isinstance(fixture, replicaset.ReplicaSetFixture): @@ -45,14 +47,13 @@ class BackgroundInitialSync(interface.Hook): self.random_restarts = 0 self._shell_options = shell_options - def after_test(self, test, test_report): - """After test execution.""" - self.tests_run += 1 - + def before_test(self, test, test_report): + """Before test execution.""" hook_test_case = BackgroundInitialSyncTestCase.create_after_test( test.logger, test, self, self._shell_options) hook_test_case.configure(self.fixture) hook_test_case.run_dynamic_test(test_report) + self.tests_run += 1 class BackgroundInitialSyncTestCase(jsfile.DynamicJSTestCase): @@ -145,6 +146,8 @@ class IntermediateInitialSync(interface.Hook): DEFAULT_N = cleanup.CleanEveryN.DEFAULT_N + IS_BACKGROUND = False + def __init__(self, hook_logger, fixture, n=DEFAULT_N): """Initialize IntermediateInitialSync.""" if not isinstance(fixture, 
replicaset.ReplicaSetFixture): diff --git a/buildscripts/resmokelib/testing/hooks/interface.py b/buildscripts/resmokelib/testing/hooks/interface.py index f9739dd2575..6802ed2fde6 100644 --- a/buildscripts/resmokelib/testing/hooks/interface.py +++ b/buildscripts/resmokelib/testing/hooks/interface.py @@ -24,6 +24,10 @@ class Hook(object, metaclass=registry.make_registry_metaclass(_HOOKS)): # pylin REGISTERED_NAME = registry.LEAVE_UNREGISTERED + # Whether the hook runs in the background of a test. Background hooks typically start their own threads; + # server-side background activity such as initial sync is also considered background. + IS_BACKGROUND = None + def __init__(self, hook_logger, fixture, description): """Initialize the Hook with the specified fixture.""" @@ -31,6 +35,10 @@ class Hook(object, metaclass=registry.make_registry_metaclass(_HOOKS)): # pylin self.fixture = fixture self.description = description + if self.IS_BACKGROUND is None: + raise ValueError( + "Concrete Hook subclasses must override the IS_BACKGROUND class property") + def before_suite(self, test_report): """Test runner calls this exactly once before they start running the suite.""" pass diff --git a/buildscripts/resmokelib/testing/hooks/oplog.py b/buildscripts/resmokelib/testing/hooks/oplog.py index 525c36d97f6..0dc0bc79f88 100644 --- a/buildscripts/resmokelib/testing/hooks/oplog.py +++ b/buildscripts/resmokelib/testing/hooks/oplog.py @@ -8,6 +8,8 @@ from buildscripts.resmokelib.testing.hooks import jsfile class CheckReplOplogs(jsfile.DataConsistencyHook): # pylint: disable=non-parent-init-called,super-init-not-called """Check that local.oplog.rs matches on the primary and secondaries.""" + IS_BACKGROUND = False + def __init__( # pylint: disable=super-init-not-called self, hook_logger, fixture, shell_options=None): """Initialize CheckReplOplogs.""" diff --git a/buildscripts/resmokelib/testing/hooks/orphans.py b/buildscripts/resmokelib/testing/hooks/orphans.py index 
770faf0eb5f..38d62f51010 100644 --- a/buildscripts/resmokelib/testing/hooks/orphans.py +++ b/buildscripts/resmokelib/testing/hooks/orphans.py @@ -9,6 +9,8 @@ from buildscripts.resmokelib.testing.hooks import jsfile class CheckOrphansDeleted(jsfile.DataConsistencyHook): """Check if the range deleter failed to delete any orphan documents.""" + IS_BACKGROUND = False + def __init__(self, hook_logger, fixture, shell_options=None): """Initialize CheckOrphansDeleted.""" diff --git a/buildscripts/resmokelib/testing/hooks/periodic_kill_secondaries.py b/buildscripts/resmokelib/testing/hooks/periodic_kill_secondaries.py index dfb4f707fce..48079ae27dd 100644 --- a/buildscripts/resmokelib/testing/hooks/periodic_kill_secondaries.py +++ b/buildscripts/resmokelib/testing/hooks/periodic_kill_secondaries.py @@ -22,6 +22,8 @@ class PeriodicKillSecondaries(interface.Hook): to the primary after an unclean shutdown. """ + IS_BACKGROUND = False + DEFAULT_PERIOD_SECS = 30 def __init__(self, hook_logger, rs_fixture, period_secs=DEFAULT_PERIOD_SECS): diff --git a/buildscripts/resmokelib/testing/hooks/reconfig_background.py b/buildscripts/resmokelib/testing/hooks/reconfig_background.py index fbd29d4d625..1c72c9f10e2 100644 --- a/buildscripts/resmokelib/testing/hooks/reconfig_background.py +++ b/buildscripts/resmokelib/testing/hooks/reconfig_background.py @@ -13,6 +13,8 @@ from buildscripts.resmokelib.testing.hooks.background_job import _BackgroundJob, class DoReconfigInBackground(jsfile.JSHook): """A hook for running a safe reconfig against a replica set while a test is running.""" + IS_BACKGROUND = True + def __init__(self, hook_logger, fixture, shell_options=None): """Initialize DoReconfigInBackground.""" description = "Run reconfigs against the primary while the test is running." 
diff --git a/buildscripts/resmokelib/testing/hooks/stepdown.py b/buildscripts/resmokelib/testing/hooks/stepdown.py index 03ca3760219..3713629e75c 100644 --- a/buildscripts/resmokelib/testing/hooks/stepdown.py +++ b/buildscripts/resmokelib/testing/hooks/stepdown.py @@ -6,7 +6,6 @@ import random import threading import time -import bson import pymongo.errors import buildscripts.resmokelib.utils.filesystem as fs @@ -24,11 +23,13 @@ class ContinuousStepdown(interface.Hook): # pylint: disable=too-many-instance-a DESCRIPTION = ("Continuous stepdown (steps down the primary of replica sets at regular" " intervals)") + IS_BACKGROUND = True + def __init__( # pylint: disable=too-many-arguments self, hook_logger, fixture, config_stepdown=True, shard_stepdown=True, stepdown_interval_ms=8000, terminate=False, kill=False, use_stepdown_permitted_file=False, wait_for_mongos_retarget=False, - stepdown_via_heartbeats=True, background_reconfig=False, auth_options=None): + background_reconfig=False, auth_options=None, should_downgrade=False): """Initialize the ContinuousStepdown. Args: @@ -42,7 +43,6 @@ class ContinuousStepdown(interface.Hook): # pylint: disable=too-many-instance-a use_stepdown_permitted_file: use a file to control if stepdown thread should do a stepdown. wait_for_mongos_retarget: whether to run validate on all mongoses for each collection in each database, after pausing the stepdown thread. - stepdown_via_heartbeats: step up secondaries instead of stepping down primary. Note that the "terminate" and "kill" arguments are named after the "SIGTERM" and "SIGKILL" signals that are used to stop the process. 
On Windows, there are no signals, @@ -55,7 +55,6 @@ class ContinuousStepdown(interface.Hook): # pylint: disable=too-many-instance-a self._shard_stepdown = shard_stepdown self._stepdown_interval_secs = float(stepdown_interval_ms) / 1000 self._wait_for_mongos_retarget = wait_for_mongos_retarget - self._stepdown_via_heartbeats = stepdown_via_heartbeats self._rs_fixtures = [] self._mongos_fixtures = [] @@ -67,6 +66,7 @@ class ContinuousStepdown(interface.Hook): # pylint: disable=too-many-instance-a self._background_reconfig = background_reconfig self._auth_options = auth_options + self._should_downgrade = should_downgrade # The stepdown file names need to match the same construction as found in # jstests/concurrency/fsm_libs/resmoke_runner.js. @@ -91,8 +91,7 @@ class ContinuousStepdown(interface.Hook): # pylint: disable=too-many-instance-a self._stepdown_thread = _StepdownThread( self.logger, self._mongos_fixtures, self._rs_fixtures, self._stepdown_interval_secs, self._terminate, self._kill, lifecycle, self._wait_for_mongos_retarget, - self._stepdown_via_heartbeats, self._background_reconfig, self._fixture, - self._auth_options) + self._background_reconfig, self._fixture, self._auth_options, self._should_downgrade) self.logger.info("Starting the stepdown thread.") self._stepdown_thread.start() @@ -353,8 +352,8 @@ class FileBasedStepdownLifecycle(object): class _StepdownThread(threading.Thread): # pylint: disable=too-many-instance-attributes def __init__( # pylint: disable=too-many-arguments self, logger, mongos_fixtures, rs_fixtures, stepdown_interval_secs, terminate, kill, - stepdown_lifecycle, wait_for_mongos_retarget, stepdown_via_heartbeats, - background_reconfig, fixture, auth_options=None): + stepdown_lifecycle, wait_for_mongos_retarget, background_reconfig, fixture, + auth_options=None, should_downgrade=False): """Initialize _StepdownThread.""" threading.Thread.__init__(self, name="StepdownThread") self.daemon = True @@ -370,10 +369,10 @@ class 
_StepdownThread(threading.Thread): # pylint: disable=too-many-instance-at self._kill = kill self.__lifecycle = stepdown_lifecycle self._should_wait_for_mongos_retarget = wait_for_mongos_retarget - self._stepdown_via_heartbeats = stepdown_via_heartbeats self._background_reconfig = background_reconfig self._fixture = fixture self._auth_options = auth_options + self._should_downgrade = should_downgrade self._last_exec = time.time() # Event set when the thread has been stopped using the 'stop()' method. @@ -493,7 +492,7 @@ class _StepdownThread(threading.Thread): # pylint: disable=too-many-instance-at # pylint: disable=R0912,R0914,R0915 def _step_down(self, rs_fixture): try: - primary = rs_fixture.get_primary(timeout_secs=self._stepdown_interval_secs) + old_primary = rs_fixture.get_primary(timeout_secs=self._stepdown_interval_secs) except errors.ServerFailure: # We ignore the ServerFailure exception because it means a primary wasn't available. # We'll try again after self._stepdown_interval_secs seconds. @@ -501,141 +500,49 @@ class _StepdownThread(threading.Thread): # pylint: disable=too-many-instance-at secondaries = rs_fixture.get_secondaries() - # Check that the fixture is still running before stepping down or killing the primary. - # This ensures we still detect some cases in which the fixture has already crashed. 
- if not rs_fixture.is_running(): - raise errors.ServerFailure("ReplicaSetFixture {} expected to be running in" - " ContinuousStepdown, but wasn't.".format( - rs_fixture.replset_name)) + if self._terminate: + if not rs_fixture.stop_primary(old_primary, self._background_reconfig, self._kill): + return + + if self._should_downgrade: + new_primary = rs_fixture.change_version_and_restart_node(old_primary, + self._auth_options) + else: + + def step_up_secondary(): + while secondaries: + chosen = random.choice(secondaries) + if not rs_fixture.stepup_node(chosen, self._auth_options): + secondaries.remove(chosen) + else: + return chosen + + new_primary = step_up_secondary() if self._terminate: - # If we're running with background reconfigs, it's possible to be in a scenario - # where we kill a necessary voting node (i.e. in a 5 node repl set), only 2 are - # voting. In this scenario, we want to avoid killing the primary because no - # secondary can step up. - if self._background_reconfig: - # stagger the kill thread so that it runs a little after the reconfig thread - time.sleep(1) - voting_members = rs_fixture.get_voting_members() + rs_fixture.restart_node(old_primary) - self.logger.info("Current voting members: %s", voting_members) - - if len(voting_members) <= 3: - # Do not kill or terminate the primary if we don't have enough voting nodes to - # elect a new primary. - return - - should_kill = self._kill and random.choice([True, False]) - action = "Killing" if should_kill else "Terminating" - self.logger.info("%s the primary on port %d of replica set '%s'.", action, primary.port, - rs_fixture.replset_name) - - # We send the mongod process the signal to exit but don't immediately wait for it to - # exit because clean shutdown may take a while and we want to restore write availability - # as quickly as possible. 
- teardown_mode = fixture_interface.TeardownMode.KILL if should_kill else fixture_interface.TeardownMode.TERMINATE - primary.mongod.stop(mode=teardown_mode) - elif not self._stepdown_via_heartbeats: - self.logger.info("Stepping down the primary on port %d of replica set '%s'.", - primary.port, rs_fixture.replset_name) - try: - client = self._create_client(primary) - client.admin.command( - bson.SON([ - ("replSetStepDown", self._stepdown_duration_secs), - ("force", True), - ])) - except pymongo.errors.AutoReconnect: - # AutoReconnect exceptions are expected as connections are closed during stepdown. - pass - except pymongo.errors.PyMongoError: - self.logger.exception( - "Error while stepping down the primary on port %d of replica set '%s'.", - primary.port, rs_fixture.replset_name) - raise - - # We have skipped stepping down the primary if we want to step up secondaries instead. Here, - # we simply need to pick an arbitrary secondary to run for election which will lead to - # unconditional stepdown on the primary. - # - # If we have terminated/killed/stepped down the primary above, write availability has lost. - # We pick an arbitrary secondary to run for election immediately in order to avoid a long - # period where the replica set doesn't have write availability. If none of the secondaries - # are eligible, or their election attempt fails, then we'll run the replSetStepUp command on - # 'primary' to ensure we have write availability sooner than the - # self._stepdown_duration_secs duration expires. - while secondaries: - chosen = random.choice(secondaries) - - self.logger.info("Attempting to step up the secondary on port %d of replica set '%s'.", - chosen.port, rs_fixture.replset_name) - - try: - client = self._create_client(chosen) - client.admin.command("replSetStepUp") - break - except pymongo.errors.OperationFailure: - # OperationFailure exceptions are expected when the election attempt fails due to - # not receiving enough votes. 
This can happen when the 'chosen' secondary's opTime - # is behind that of other secondaries. We handle this by attempting to elect a - # different secondary. - self.logger.info("Failed to step up the secondary on port %d of replica set '%s'.", - chosen.port, rs_fixture.replset_name) - secondaries.remove(chosen) - except pymongo.errors.AutoReconnect: - # It is possible for a replSetStepUp to fail with AutoReconnect if that node goes - # into Rollback (which causes it to close any open connections). - pass - - if self._terminate: - self.logger.info("Waiting for the old primary on port %d of replica set '%s' to exit.", - primary.port, rs_fixture.replset_name) - - primary.mongod.wait() - - self.logger.info("Attempting to restart the old primary on port %d of replica set '%s.", - primary.port, rs_fixture.replset_name) - - # Restart the mongod on the old primary and wait until we can contact it again. Keep the - # original preserve_dbpath to restore after restarting the mongod. - original_preserve_dbpath = primary.preserve_dbpath - primary.preserve_dbpath = True - try: - primary.setup() - self.logger.info(fixture_interface.create_fixture_table(self._fixture)) - primary.await_ready() - finally: - primary.preserve_dbpath = original_preserve_dbpath - elif not self._stepdown_via_heartbeats: - # If we chose to step up a secondary instead, the primary was never stepped down via the - # replSetStepDown command and thus will not have a stepdown period. So we can skip - # running {replSetFreeze: 0}. Otherwise, the replSetStepDown command run earlier - # introduced a stepdown period on the former primary and so we have to run the - # {replSetFreeze: 0} command to ensure the former primary is electable in the next - # round of _step_down(). - client = self._create_client(primary) - client.admin.command({"replSetFreeze": 0}) - elif secondaries: + if secondaries: # We successfully stepped up a secondary, wait for the former primary to step down via # heartbeats. 
We need to wait for the former primary to step down to complete this step # down round and to avoid races between the ContinuousStepdown hook and other test hooks # that may depend on the health of the replica set. self.logger.info( "Successfully stepped up the secondary on port %d of replica set '%s'.", - chosen.port, rs_fixture.replset_name) + new_primary.port, rs_fixture.replset_name) while True: try: - client = self._create_client(primary) + client = self._create_client(old_primary) is_secondary = client.admin.command("isMaster")["secondary"] if is_secondary: break except pymongo.errors.AutoReconnect: pass self.logger.info("Waiting for primary on port %d of replica set '%s' to step down.", - primary.port, rs_fixture.replset_name) + old_primary.port, rs_fixture.replset_name) time.sleep(0.2) # Wait a little bit before trying again. - self.logger.info("Primary on port %d of replica set '%s' stepped down.", primary.port, - rs_fixture.replset_name) + self.logger.info("Primary on port %d of replica set '%s' stepped down.", + old_primary.port, rs_fixture.replset_name) if not secondaries: # If we failed to step up one of the secondaries, then we run the replSetStepUp to try @@ -643,11 +550,12 @@ class _StepdownThread(threading.Thread): # pylint: disable=too-many-instance-at # self._stepdown_duration_secs seconds to restore write availability to the cluster. # Since the former primary may have been killed, we need to wait until it has been # restarted by retrying replSetStepUp. + retry_time_secs = rs_fixture.AWAIT_REPL_TIMEOUT_MINS * 60 retry_start_time = time.time() while True: try: - client = self._create_client(primary) + client = self._create_client(old_primary) client.admin.command("replSetStepUp") break except pymongo.errors.OperationFailure: @@ -660,8 +568,9 @@ class _StepdownThread(threading.Thread): # pylint: disable=too-many-instance-at # Bump the counter for the chosen secondary to indicate that the replSetStepUp command # executed successfully. 
- key = "{}/{}".format(rs_fixture.replset_name, - chosen.get_internal_connection_string() if secondaries else "none") + key = "{}/{}".format( + rs_fixture.replset_name, + new_primary.get_internal_connection_string() if secondaries else "none") self._step_up_stats[key] += 1 def _do_wait_for_mongos_retarget(self): # pylint: disable=too-many-branches diff --git a/buildscripts/resmokelib/testing/hooks/tenant_migration.py b/buildscripts/resmokelib/testing/hooks/tenant_migration.py index c6799241bce..bfadad6799d 100644 --- a/buildscripts/resmokelib/testing/hooks/tenant_migration.py +++ b/buildscripts/resmokelib/testing/hooks/tenant_migration.py @@ -23,6 +23,8 @@ class ContinuousTenantMigration(interface.Hook): # pylint: disable=too-many-ins DESCRIPTION = ("Continuous tenant migrations") + IS_BACKGROUND = True + def __init__(self, hook_logger, fixture, shell_options): """Initialize the ContinuousTenantMigration. diff --git a/buildscripts/resmokelib/testing/hooks/validate.py b/buildscripts/resmokelib/testing/hooks/validate.py index 40b9970cc1d..c10b75b8e00 100644 --- a/buildscripts/resmokelib/testing/hooks/validate.py +++ b/buildscripts/resmokelib/testing/hooks/validate.py @@ -12,6 +12,8 @@ class ValidateCollections(jsfile.DataConsistencyHook): node, primary replica-set node, or primary shard node. 
""" + IS_BACKGROUND = False + def __init__( # pylint: disable=super-init-not-called self, hook_logger, fixture, shell_options=None): """Initialize ValidateCollections.""" diff --git a/buildscripts/resmokelib/testing/hooks/validate_background.py b/buildscripts/resmokelib/testing/hooks/validate_background.py index b873c8a5d4f..e2953fd7183 100644 --- a/buildscripts/resmokelib/testing/hooks/validate_background.py +++ b/buildscripts/resmokelib/testing/hooks/validate_background.py @@ -15,6 +15,8 @@ from buildscripts.resmokelib.testing.hooks.background_job import _BackgroundJob, class ValidateCollectionsInBackground(jsfile.JSHook): """A hook to run background collection validation against test servers while a test is running.""" + IS_BACKGROUND = True + def __init__(self, hook_logger, fixture, shell_options=None): """Initialize ValidateCollectionsInBackground.""" description = "Run background collection validation against all mongods while a test is running" diff --git a/buildscripts/resmokelib/testing/hooks/wait_for_replication.py b/buildscripts/resmokelib/testing/hooks/wait_for_replication.py index c771a3008cb..e3308a60301 100644 --- a/buildscripts/resmokelib/testing/hooks/wait_for_replication.py +++ b/buildscripts/resmokelib/testing/hooks/wait_for_replication.py @@ -10,6 +10,8 @@ from buildscripts.resmokelib.testing.hooks import interface class WaitForReplication(interface.Hook): """Wait for replication to complete.""" + IS_BACKGROUND = False + def __init__(self, hook_logger, fixture): """Initialize WaitForReplication.""" description = "WaitForReplication waits on a replica set" diff --git a/buildscripts/resmokelib/testing/job.py b/buildscripts/resmokelib/testing/job.py index 5c2bf12c468..3361b7c6c09 100644 --- a/buildscripts/resmokelib/testing/job.py +++ b/buildscripts/resmokelib/testing/job.py @@ -197,11 +197,16 @@ class Job(object): # pylint: disable=too-many-instance-attributes "%s not running after %s" % (self.fixture, test.short_description())) finally: success = 
self.report.find_test_info(test).status == "pass" + + # Stop background hooks first since they can interfere with fixture startup and teardown + # done as part of archival. + self._run_hooks_after_tests(test, background=True) + if self.archival: result = TestResult(test=test, hook=None, success=success) self.archival.archive(self.logger, result, self.manager) - self._run_hooks_after_tests(test) + self._run_hooks_after_tests(test, background=False) def _run_hook(self, hook, hook_function, test): """Provide helper to run hook and archival.""" @@ -247,15 +252,19 @@ class Job(object): # pylint: disable=too-many-instance-attributes self.report.stopTest(test) raise - def _run_hooks_after_tests(self, test): + def _run_hooks_after_tests(self, test, background=False): """Run the after_test method on each of the hooks. Swallows any TestFailure exceptions if set to continue on failure, and reraises any other exceptions. + + @param test: the test after which we run the hooks. + @param background: if True, run only the background hooks; if False, run only the non-background hooks. 
""" try: for hook in self.hooks: - self._run_hook(hook, hook.after_test, test) + if hook.IS_BACKGROUND == background: + self._run_hook(hook, hook.after_test, test) except errors.StopExecution: raise diff --git a/buildscripts/tests/resmokelib/testing/hooks/test_stepdown.py b/buildscripts/tests/resmokelib/testing/hooks/test_stepdown.py index cbc1a9f95ae..28e93b73910 100644 --- a/buildscripts/tests/resmokelib/testing/hooks/test_stepdown.py +++ b/buildscripts/tests/resmokelib/testing/hooks/test_stepdown.py @@ -36,7 +36,6 @@ class TestStepdownThread(unittest.TestCase): kill=False, stepdown_lifecycle=_stepdown.FlagBasedStepdownLifecycle(), wait_for_mongos_retarget=False, - stepdown_via_heartbeats=True, background_reconfig=False, fixture=shardcluster_fixture, ) diff --git a/buildscripts/tests/test_feature_flag_tags_check.py b/buildscripts/tests/test_feature_flag_tags_check.py index 292ed05a1d6..26593992184 100644 --- a/buildscripts/tests/test_feature_flag_tags_check.py +++ b/buildscripts/tests/test_feature_flag_tags_check.py @@ -9,7 +9,7 @@ from buildscripts import feature_flag_tags_check class TestFindTestsInGitDiff(unittest.TestCase): @classmethod def setUpClass(cls): - cls.requires_fcv_tag = "requires_fcv_50" + cls.requires_fcv_tag = "requires_fcv_51" cls.original_requires_fcv_tag = feature_flag_tags_check.REQUIRES_FCV_TAG feature_flag_tags_check.REQUIRES_FCV_TAG = cls.requires_fcv_tag diff --git a/etc/evergreen.yml b/etc/evergreen.yml index 6c1a9835a49..75dbcc7904f 100644 --- a/etc/evergreen.yml +++ b/etc/evergreen.yml @@ -1,4 +1,4 @@ -##################################################### +#################################################### # YAML Conventions # ##################################################### # Please see our conventions document at @@ -5693,6 +5693,17 @@ tasks: use_large_distro: "true" resmoke_args: --storageEngine=wiredTiger +- <<: *gen_task_template + name: retryable_writes_downgrade_passthrough_gen + tags: ["multiversion_passthrough"] + 
commands: + - func: "generate resmoke tasks" + vars: + suite: retryable_writes_downgrade_passthrough + resmoke_args: --storageEngine=wiredTiger + require_multiversion: true + implicit_multiversion: true + - <<: *gen_task_template name: logical_session_cache_replication_default_refresh_jscore_passthrough_gen tags: ["logical_session_cache", "repl"] diff --git a/jstests/core/api_version_new_50_language_features.js b/jstests/core/api_version_new_50_language_features.js index 9344129b300..a2c703cd8b3 100644 --- a/jstests/core/api_version_new_50_language_features.js +++ b/jstests/core/api_version_new_50_language_features.js @@ -3,9 +3,12 @@ * yet. This test should be updated or removed in a future release when we have more confidence that * the behavior and syntax is stable. * + * TODO: SERVER-58962 remove requires_fcv_51 from this file or change the test code. + * * @tags: [ * requires_fcv_51, * uses_api_parameters, + * requires_fcv_51, * ] */ diff --git a/jstests/core/doc_validation_error.js b/jstests/core/doc_validation_error.js index e517d4f333d..22815a5d787 100644 --- a/jstests/core/doc_validation_error.js +++ b/jstests/core/doc_validation_error.js @@ -1,8 +1,9 @@ /** * Tests document validation errors with sample validators. Note that it uses some JSON Schemas from * schemastore.org. + * * @tags: [ - * requires_fcv_51, + * requires_fcv_51 * ] */ (function() { diff --git a/jstests/core/mod_special_values.js b/jstests/core/mod_special_values.js index 7b39bcc9a4a..adc1a27958d 100644 --- a/jstests/core/mod_special_values.js +++ b/jstests/core/mod_special_values.js @@ -1,9 +1,8 @@ /** * Tests $mod match expression with NaN, Infinity and large value inputs. - * @tags: [ - * # This test exercises a changed behavior, thus prevent it running in multi-version variants. - * requires_fcv_51, - * ] + * + * This test exercises changed behavior, so it must not run in multi-version variants. 
+ * @tags: [requires_fcv_51] */ (function() { "use strict"; diff --git a/jstests/core/regex_options.js b/jstests/core/regex_options.js index d6c95e42559..120a3a9c9a5 100644 --- a/jstests/core/regex_options.js +++ b/jstests/core/regex_options.js @@ -1,8 +1,6 @@ /** * Test regex options in a find context. - * @tags: [ - * requires_fcv_51, - * ] + * @tags: [requires_fcv_51] */ (function() { 'use strict';