diff --git a/buildscripts/resmokelib/core/jasper_process.py b/buildscripts/resmokelib/core/jasper_process.py index 941d4cfbe47..e6839760776 100644 --- a/buildscripts/resmokelib/core/jasper_process.py +++ b/buildscripts/resmokelib/core/jasper_process.py @@ -10,6 +10,7 @@ try: except ImportError: pass +from buildscripts.resmokelib.testing.fixtures import interface as fixture_interface from . import process as _process @@ -49,11 +50,13 @@ class Process(_process.Process): self._id = self.jasper_pb2.JasperProcessID(value=val.id) self._return_code = None - def stop(self, kill=False): + def stop(self, mode=None): """Terminate the process.""" + + should_kill = mode == fixture_interface.TeardownMode.KILL signal = self.jasper_pb2.Signals.Value("TERMINATE") if sys.platform == "win32": - if not kill: + if not should_kill: event_name = self.jasper_pb2.EventName(value="Global\\Mongo_" + str(self.pid)) signal_event = self._stub.SignalEvent(event_name) if signal_event.success: @@ -64,7 +67,7 @@ class Process(_process.Process): processID=self._id, signalTriggerID=self.jasper_pb2.SignalTriggerID.Value("CLEANTERMINATION")) self._stub.RegisterSignalTriggerID(clean_termination_params) - elif kill: + elif should_kill: signal = self.jasper_pb2.Signals.Value("KILL") signal_process = self.jasper_pb2.SignalProcess(ProcessID=self._id, signal=signal) diff --git a/buildscripts/resmokelib/core/process.py b/buildscripts/resmokelib/core/process.py index 2c49eba1e12..59d533b534b 100644 --- a/buildscripts/resmokelib/core/process.py +++ b/buildscripts/resmokelib/core/process.py @@ -12,6 +12,8 @@ import sys import threading import subprocess +from buildscripts.resmokelib.testing.fixtures import interface as fixture_interface +from buildscripts.resmokelib import errors from . import pipe # pylint: disable=wrong-import-position from .. 
import utils # pylint: disable=wrong-import-position @@ -130,12 +132,16 @@ class Process(object): if return_code == win32con.STILL_ACTIVE: raise - def stop(self, kill=False): # pylint: disable=too-many-branches + def stop(self, mode=None): # pylint: disable=too-many-branches """Terminate the process.""" + if mode is None: + mode = fixture_interface.TeardownMode.TERMINATE + if sys.platform == "win32": # Attempt to cleanly shutdown mongod. - if not kill and self.args and self.args[0].find("mongod") != -1: + if mode != fixture_interface.TeardownMode.KILL and self.args and self.args[0].find( + "mongod") != -1: mongo_signal_handle = None try: mongo_signal_handle = win32event.OpenEvent( @@ -180,10 +186,16 @@ class Process(object): raise else: try: - if kill: + if mode == fixture_interface.TeardownMode.KILL: self._process.kill() - else: + elif mode == fixture_interface.TeardownMode.TERMINATE: self._process.terminate() + elif mode == fixture_interface.TeardownMode.ABORT: + self._process.send_signal(mode.value) + else: + raise errors.ProcessError("Process wrapper given unrecognized teardown mode: " + + str(mode)) + except OSError as err: # ESRCH (errno=3) is received when the process has already died. if err.errno != 3: diff --git a/buildscripts/resmokelib/errors.py b/buildscripts/resmokelib/errors.py index a0bddfdbc23..0c344e168bb 100644 --- a/buildscripts/resmokelib/errors.py +++ b/buildscripts/resmokelib/errors.py @@ -50,3 +50,13 @@ class PortAllocationError(ResmokeError): # noqa: D204 fixture requests more ports than were reserved for that job. """ pass + + +class ProcessError(ResmokeError): + """Exception raised in the process wrapper. + + Raised if a teardown mode is given to the process wrapper that it doesn't + know how to send a signal for. 
+ """ + + pass diff --git a/buildscripts/resmokelib/testing/fixtures/interface.py b/buildscripts/resmokelib/testing/fixtures/interface.py index 7eebc5071b4..fd98171a0de 100644 --- a/buildscripts/resmokelib/testing/fixtures/interface.py +++ b/buildscripts/resmokelib/testing/fixtures/interface.py @@ -2,6 +2,7 @@ import os.path import time +from enum import Enum import pymongo import pymongo.errors @@ -16,6 +17,19 @@ from ...utils import registry _FIXTURES = {} # type: ignore +class TeardownMode(Enum): + """ + Enumeration representing different ways a fixture can be torn down. + + Each constant has the value of a Linux signal, even though the signal won't be used on Windows. + This class is used because the 'signal' package on Windows has different values. + """ + + TERMINATE = 15 + KILL = 9 + ABORT = 6 + + def make_fixture(class_name, *args, **kwargs): """Provide factory function for creating Fixture instances.""" @@ -65,7 +79,7 @@ class Fixture(object, metaclass=registry.make_registry_metaclass(_FIXTURES)): """Block until the fixture can be used for testing.""" pass - def teardown(self, finished=False, kill=False): # noqa + def teardown(self, finished=False, mode=None): # noqa """Destroy the fixture. The fixture's logging handlers are closed if 'finished' is true, @@ -76,7 +90,7 @@ class Fixture(object, metaclass=registry.make_registry_metaclass(_FIXTURES)): """ try: - self._do_teardown(kill=kill) + self._do_teardown(mode=mode) finally: if finished: for handler in self.logger.handlers: @@ -84,7 +98,7 @@ class Fixture(object, metaclass=registry.make_registry_metaclass(_FIXTURES)): # want the logs to eventually get flushed. logging.flush.close_later(handler) - def _do_teardown(self, kill=False): # noqa + def _do_teardown(self, mode=None): # noqa """Destroy the fixture. This method must be implemented by subclasses. 
@@ -243,7 +257,7 @@ class FixtureTeardownHandler(object): """ return self._message - def teardown(self, fixture, name, kill=False): # noqa: D406,D407,D411,D413 + def teardown(self, fixture, name, mode=None): # noqa: D406,D407,D411,D413 """Tear down the given fixture and log errors instead of raising a ServerFailure exception. Args: @@ -254,7 +268,7 @@ class FixtureTeardownHandler(object): """ try: self._logger.info("Stopping %s...", name) - fixture.teardown(kill=kill) + fixture.teardown(mode=mode) self._logger.info("Successfully stopped %s.", name) return True except errors.ServerFailure as err: diff --git a/buildscripts/resmokelib/testing/fixtures/replicaset.py b/buildscripts/resmokelib/testing/fixtures/replicaset.py index 97b8697312a..1f02b770fd6 100644 --- a/buildscripts/resmokelib/testing/fixtures/replicaset.py +++ b/buildscripts/resmokelib/testing/fixtures/replicaset.py @@ -400,7 +400,7 @@ class ReplicaSetFixture(interface.ReplFixture): # pylint: disable=too-many-inst primary = self.nodes[0] primary.mongo_client().admin.command(cmd) - def _do_teardown(self, kill=False): + def _do_teardown(self, mode=None): self.logger.info("Stopping all members of the replica set...") running_at_start = self.is_running() @@ -411,11 +411,11 @@ class ReplicaSetFixture(interface.ReplFixture): # pylint: disable=too-many-inst teardown_handler = interface.FixtureTeardownHandler(self.logger) if self.initial_sync_node: - teardown_handler.teardown(self.initial_sync_node, "initial sync node", kill=kill) + teardown_handler.teardown(self.initial_sync_node, "initial sync node", mode=mode) # Terminate the secondaries first to reduce noise in the logs. 
for node in reversed(self.nodes): - teardown_handler.teardown(node, "replica set member on port %d" % node.port, kill=kill) + teardown_handler.teardown(node, "replica set member on port %d" % node.port, mode=mode) if teardown_handler.was_successful(): self.logger.info("Successfully stopped all members of the replica set.") diff --git a/buildscripts/resmokelib/testing/fixtures/shardedcluster.py b/buildscripts/resmokelib/testing/fixtures/shardedcluster.py index da1e3f575ab..50e7acbaecd 100644 --- a/buildscripts/resmokelib/testing/fixtures/shardedcluster.py +++ b/buildscripts/resmokelib/testing/fixtures/shardedcluster.py @@ -191,7 +191,7 @@ class ShardedClusterFixture(interface.Fixture): # pylint: disable=too-many-inst client.admin.command({"balancerStart": 1}, maxTimeMS=timeout_ms) self.logger.info("Started the balancer") - def _do_teardown(self, kill=False): + def _do_teardown(self, mode=None): """Shut down the sharded cluster.""" self.logger.info("Stopping all members of the sharded cluster...") @@ -200,21 +200,22 @@ class ShardedClusterFixture(interface.Fixture): # pylint: disable=too-many-inst self.logger.warning("All members of the sharded cluster were expected to be running, " "but weren't.") - # If we're killing to archive data files, stopping the balancer will execute + # If we're killing or aborting to archive data files, stopping the balancer will execute # server commands that might lead to on-disk changes from the point of failure. 
- if self.enable_balancer and not kill: + if self.enable_balancer and mode not in (interface.TeardownMode.KILL, + interface.TeardownMode.ABORT): self.stop_balancer() teardown_handler = interface.FixtureTeardownHandler(self.logger) if self.configsvr is not None: - teardown_handler.teardown(self.configsvr, "config server", kill=kill) + teardown_handler.teardown(self.configsvr, "config server", mode=mode) for mongos in self.mongos: - teardown_handler.teardown(mongos, "mongos", kill=kill) + teardown_handler.teardown(mongos, "mongos", mode=mode) for shard in self.shards: - teardown_handler.teardown(shard, "shard", kill=kill) + teardown_handler.teardown(shard, "shard", mode=mode) if teardown_handler.was_successful(): self.logger.info("Successfully stopped all members of the sharded cluster.") @@ -431,12 +432,18 @@ class _MongoSFixture(interface.Fixture): self.logger.info("Successfully contacted the mongos on port %d.", self.port) - def _do_teardown(self, kill=False): + def _do_teardown(self, mode=None): if self.mongos is None: self.logger.warning("The mongos fixture has not been set up yet.") return # Teardown is still a success even if nothing is running. - self.logger.info("Stopping mongos on port %d with pid %d...", self.port, self.mongos.pid) + if mode == interface.TeardownMode.ABORT: + self.logger.info( + "Attempting to send SIGABRT from resmoke to mongos on port %d with pid %d...", + self.port, self.mongos.pid) + else: + self.logger.info("Stopping mongos on port %d with pid %d...", self.port, + self.mongos.pid) if not self.is_running(): exit_code = self.mongos.poll() msg = ("mongos on port {:d} was expected to be running, but wasn't. " @@ -444,12 +451,12 @@ class _MongoSFixture(interface.Fixture): self.logger.warning(msg) raise errors.ServerFailure(msg) - self.mongos.stop(kill=kill) + self.mongos.stop(mode=mode) exit_code = self.mongos.wait() - # SIGKILL has an exit code of 9 and Python's subprocess module returns - # negative versions of system calls. 
- if exit_code == 0 or (exit_code == -9 and kill): + # Python's subprocess module returns negative versions of system calls. + # pylint: disable=invalid-unary-operand-type + if exit_code == 0 or (mode is not None and exit_code == -(mode.value)): self.logger.info("Successfully stopped the mongos on port {:d}".format(self.port)) else: self.logger.warning("Stopped the mongos on port {:d}. " diff --git a/buildscripts/resmokelib/testing/fixtures/standalone.py b/buildscripts/resmokelib/testing/fixtures/standalone.py index 3c54f41f689..d4b1e0505ff 100644 --- a/buildscripts/resmokelib/testing/fixtures/standalone.py +++ b/buildscripts/resmokelib/testing/fixtures/standalone.py @@ -111,12 +111,18 @@ class MongoDFixture(interface.Fixture): self.logger.info("Successfully contacted the mongod on port %d.", self.port) - def _do_teardown(self, kill=False): + def _do_teardown(self, mode=None): if self.mongod is None: self.logger.warning("The mongod fixture has not been set up yet.") return # Still a success even if nothing is running. - self.logger.info("Stopping mongod on port %d with pid %d...", self.port, self.mongod.pid) + if mode == interface.TeardownMode.ABORT: + self.logger.info( + "Attempting to send SIGABRT from resmoke to mongod on port %d with pid %d...", + self.port, self.mongod.pid) + else: + self.logger.info("Stopping mongod on port %d with pid %d...", self.port, + self.mongod.pid) if not self.is_running(): exit_code = self.mongod.poll() msg = ("mongod on port {:d} was expected to be running, but wasn't. " @@ -124,12 +130,12 @@ class MongoDFixture(interface.Fixture): self.logger.warning(msg) raise errors.ServerFailure(msg) - self.mongod.stop(kill) + self.mongod.stop(mode) exit_code = self.mongod.wait() - # SIGKILL has an exit code of 9 and Python's subprocess module returns - # negative versions of system calls. - if exit_code == 0 or (exit_code == -9 and kill): + # Python's subprocess module returns negative versions of system calls. 
+ # pylint: disable=invalid-unary-operand-type + if exit_code == 0 or (mode is not None and exit_code == -(mode.value)): self.logger.info("Successfully stopped the mongod on port {:d}.".format(self.port)) else: self.logger.warning("Stopped the mongod on port {:d}. " diff --git a/buildscripts/resmokelib/testing/fixtures/yesfixture.py b/buildscripts/resmokelib/testing/fixtures/yesfixture.py index 310bd8c34b1..7eac09f58c3 100644 --- a/buildscripts/resmokelib/testing/fixtures/yesfixture.py +++ b/buildscripts/resmokelib/testing/fixtures/yesfixture.py @@ -35,7 +35,7 @@ class YesFixture(interface.Fixture): # pylint: disable=abstract-method logger = self.logger.new_fixture_node_logger("yes{:d}".format(index)) return programs.generic_program(logger, ["yes", self.__message]) - def _do_teardown(self, kill=False): + def _do_teardown(self, mode=None): running_at_start = self.is_running() success = True # Still a success even if nothing is running. @@ -49,7 +49,7 @@ class YesFixture(interface.Fixture): # pylint: disable=abstract-method if process is not None: if running_at_start: self.logger.info("Stopping yes process with pid %d...", process.pid) - process.stop(kill) + process.stop(mode) exit_code = process.wait() success = (exit_code == -signal.SIGTERM) and success diff --git a/buildscripts/resmokelib/testing/hook_test_archival.py b/buildscripts/resmokelib/testing/hook_test_archival.py index 958604faf20..4a643f541b3 100644 --- a/buildscripts/resmokelib/testing/hook_test_archival.py +++ b/buildscripts/resmokelib/testing/hook_test_archival.py @@ -103,8 +103,8 @@ class HookTestArchival(object): """Trigger archive of data files for a test or hook.""" # We can still attempt archiving even if the teardown fails. 
- if not manager.teardown_fixture(logger, kill=True): - logger.warning("Error while killing test fixtures; data files may be invalid.") + if not manager.teardown_fixture(logger, abort=True): + logger.warning("Error while aborting test fixtures; data files may be invalid.") with self._lock: # Test repeat number is how many times the particular test has been archived. if test_name not in self._tests_repeat: diff --git a/buildscripts/resmokelib/testing/hooks/periodic_kill_secondaries.py b/buildscripts/resmokelib/testing/hooks/periodic_kill_secondaries.py index 3230d5f23de..1136d69806e 100644 --- a/buildscripts/resmokelib/testing/hooks/periodic_kill_secondaries.py +++ b/buildscripts/resmokelib/testing/hooks/periodic_kill_secondaries.py @@ -169,7 +169,7 @@ class PeriodicKillSecondariesTestCase(interface.DynamicTestCase): " PeriodicKillSecondaries.after_test(), but wasn't.".format(secondary.port)) self.logger.info("Killing the secondary on port %d...", secondary.port) - secondary.mongod.stop(kill=True) + secondary.mongod.stop(mode=fixture.TeardownMode.KILL) try: self.fixture.teardown() diff --git a/buildscripts/resmokelib/testing/hooks/stepdown.py b/buildscripts/resmokelib/testing/hooks/stepdown.py index 3cf2aa5b9ab..de2c89e9a38 100644 --- a/buildscripts/resmokelib/testing/hooks/stepdown.py +++ b/buildscripts/resmokelib/testing/hooks/stepdown.py @@ -14,6 +14,7 @@ from buildscripts.resmokelib import utils from buildscripts.resmokelib.testing.hooks import interface from buildscripts.resmokelib.testing.fixtures import replicaset from buildscripts.resmokelib.testing.fixtures import shardedcluster +from buildscripts.resmokelib.testing.fixtures import interface as fixture_interface class ContinuousStepdown(interface.Hook): # pylint: disable=too-many-instance-attributes @@ -481,7 +482,8 @@ class _StepdownThread(threading.Thread): # pylint: disable=too-many-instance-at # We send the mongod process the signal to exit but don't immediately wait for it to # exit because clean 
shutdown may take a while and we want to restore write availability # as quickly as possible. - primary.mongod.stop(kill=should_kill) + teardown_mode = fixture_interface.TeardownMode.KILL if should_kill else fixture_interface.TeardownMode.TERMINATE + primary.mongod.stop(mode=teardown_mode) elif not self._stepdown_via_heartbeats: self.logger.info("Stepping down the primary on port %d of replica set '%s'.", primary.port, rs_fixture.replset_name) diff --git a/buildscripts/resmokelib/testing/job.py b/buildscripts/resmokelib/testing/job.py index 1d7d76ddfdc..8ed225e84c5 100644 --- a/buildscripts/resmokelib/testing/job.py +++ b/buildscripts/resmokelib/testing/job.py @@ -332,16 +332,16 @@ class FixtureTestCaseManager: return True - def teardown_fixture(self, logger, kill=False): + def teardown_fixture(self, logger, abort=False): """ Run a test that tears down the job's fixture. Return True if the teardown was successful, False otherwise. """ - if kill: - test_case = _fixture.FixtureKillTestCase(self.test_queue_logger, self.fixture, - "job{}".format(self.job_num), - self.times_set_up) + if abort: + test_case = _fixture.FixtureAbortTestCase(self.test_queue_logger, self.fixture, + "job{}".format(self.job_num), + self.times_set_up) self.times_set_up += 1 else: test_case = _fixture.FixtureTeardownTestCase(self.test_queue_logger, self.fixture, diff --git a/buildscripts/resmokelib/testing/report.py b/buildscripts/resmokelib/testing/report.py index 9522a2a6df5..49709a063cc 100644 --- a/buildscripts/resmokelib/testing/report.py +++ b/buildscripts/resmokelib/testing/report.py @@ -374,7 +374,9 @@ def test_order(test_name): return 1 elif 'fixture_teardown' in test_name: return 2 - elif ':' in test_name: + elif 'fixture_abort' in test_name: return 3 - else: + elif ':' in test_name: return 4 + else: + return 5 diff --git a/buildscripts/resmokelib/testing/testcases/cpp_libfuzzer_test.py b/buildscripts/resmokelib/testing/testcases/cpp_libfuzzer_test.py index b624e92b0cc..d3b9bec01c1 
100644 --- a/buildscripts/resmokelib/testing/testcases/cpp_libfuzzer_test.py +++ b/buildscripts/resmokelib/testing/testcases/cpp_libfuzzer_test.py @@ -42,7 +42,7 @@ class CPPLibfuzzerTestCase(interface.ProcessTestCase): self.return_code = process.wait(self.DEFAULT_TIMEOUT.total_seconds()) except subprocess.TimeoutExpired: # If the test timeout, then no errors were detected. Thus, the return code should be 0. - process.stop(kill=True) + process.stop(mode=interface.TerminationMode.KILL) process.wait() self.logger.info("%s timed out. No errors were found.", self.short_description()) self.return_code = 0 diff --git a/buildscripts/resmokelib/testing/testcases/fixture.py b/buildscripts/resmokelib/testing/testcases/fixture.py index 346db4f3be3..4cd6a6a2053 100644 --- a/buildscripts/resmokelib/testing/testcases/fixture.py +++ b/buildscripts/resmokelib/testing/testcases/fixture.py @@ -1,7 +1,9 @@ """The unittest.TestCase instances for setting up and tearing down fixtures.""" + from buildscripts.resmokelib import errors from buildscripts.resmokelib.testing.testcases import interface from buildscripts.resmokelib.utils import registry +from buildscripts.resmokelib.testing.fixtures import interface as fixture_interface class FixtureTestCase(interface.TestCase): # pylint: disable=abstract-method @@ -74,14 +76,14 @@ class FixtureTeardownTestCase(FixtureTestCase): raise -class FixtureKillTestCase(FixtureTestCase): +class FixtureAbortTestCase(FixtureTestCase): """TestCase for killing a fixture. 
Intended for use before archiving a failed test.""" REGISTERED_NAME = registry.LEAVE_UNREGISTERED - PHASE = "kill" + PHASE = "abort" def __init__(self, logger, fixture, job_name, times_set_up): - """Initialize the FixtureKillTestCase.""" + """Initialize the FixtureAbortTestCase.""" specific_phase = "{phase}_{times_set_up}".format(phase=self.PHASE, times_set_up=times_set_up) FixtureTestCase.__init__(self, logger, job_name, specific_phase) @@ -91,10 +93,14 @@ class FixtureKillTestCase(FixtureTestCase): """Tear down the fixture.""" try: self.return_code = 2 # Test return code of 2 is used for fixture failures. - self.logger.info("Killing the fixture %s.", self.fixture) - self.fixture.teardown(finished=False, kill=True) - self.logger.info("Finished killing %s.", self.fixture) + self.logger.info("Aborting the fixture %s due to test failure.", self.fixture) + self.fixture.teardown(finished=False, mode=fixture_interface.TeardownMode.ABORT) + self.logger.info("Finished aborting %s.", self.fixture) + self.return_code = 0 + except errors.ServerFailure: + # If the server wasn't already running, we can't exactly fail to abort it. 
+ self.logger.info("Finished aborting %s.", self.fixture) self.return_code = 0 except: - self.logger.exception("An error occurred while killing %s.", self.fixture) + self.logger.exception("An error occurred while aborting %s.", self.fixture) raise diff --git a/buildscripts/tests/resmokelib/testing/fixtures/test_interface.py b/buildscripts/tests/resmokelib/testing/fixtures/test_interface.py index e7685ece4c4..5d6ff38bd54 100644 --- a/buildscripts/tests/resmokelib/testing/fixtures/test_interface.py +++ b/buildscripts/tests/resmokelib/testing/fixtures/test_interface.py @@ -51,6 +51,6 @@ class UnitTestFixture(interface.Fixture): # pylint: disable=abstract-method interface.Fixture.__init__(self, logger, 99) self._should_raise = should_raise - def _do_teardown(self, kill=False): + def _do_teardown(self, mode=None): if self._should_raise: raise errors.ServerFailure(self.ERROR_MESSAGE)