# Source: mongo/buildscripts/resmokelib/testing/hooks/simulate_crash.py
# (198 lines, 7.0 KiB, Python)

"""Test hook for simulating a node crash.
This hook will periodically send a SIGSTOP signal to the replica set nodes and copy their data
files. After copying the data files, the SIGCONT signal will be sent to the replica set nodes to
continue normal operation.
A standalone node will be started on the copied data files. These data files will be treated as an
unclean shutdown. Once started, validate will be run on all collections. A validate failure
indicates a problem.
"""
import os
import pprint
import random
import shutil
import threading
import time
import pymongo
from buildscripts.resmokelib.core import process
from buildscripts.resmokelib.testing.hooks import interface
def validate(mdb, logger):
    """Return True if every collection in every database passes validation.

    Runs the ``validate`` command against each collection reachable through
    ``mdb``. The command is issued with ``check=False`` so that a failing
    response is inspected here instead of being raised by the driver.

    Args:
        mdb: client object exposing ``list_database_names()`` and
            ``get_database(name)``; callers in this file pass a
            ``pymongo.MongoClient``.
        logger: logger whose ``info`` method receives the failing response.

    Returns:
        False as soon as one collection reports a failure other than error
        code 166 (validate is not supported on a view); True otherwise.
    """
    # list_database_names() replaces database_names(), which newer PyMongo
    # releases removed; it is available wherever list_collection_names() is.
    for db_name in mdb.list_database_names():
        database = mdb.get_database(db_name)
        for coll in database.list_collection_names():
            res = database.command({"validate": coll}, check=False)
            if res["ok"] == 1.0 and res["valid"] is True:
                continue
            if res.get("code") == 166:
                # Command not supported on view.
                continue
            # Logger methods apply %-style formatting to their arguments, so
            # the placeholder must be %s for the response to be rendered.
            logger.info("FAILURE!\nValidate Response: %s", pprint.pformat(res))
            return False
    return True
class BGJob(threading.Thread):
    """Background job to pause nodes, copy data files, resume nodes, and validate data files."""

    def __init__(self, hook):
        """Initialize the background job.

        Args:
            hook: object providing ``pause_and_copy(backup_num)``,
                ``validate_all(backup_num)`` and a ``running_test`` attribute.
        """
        threading.Thread.__init__(self, name="SimulateCrashes")
        self.daemon = True
        self._hook = hook
        self._lock = threading.Lock()
        # Stop flag, guarded by _lock. Deliberately NOT named ``is_alive``:
        # assigning to that name would shadow threading.Thread.is_alive().
        self._is_alive = True
        self.backup_num = 0
        self.found_error = False
        # (type, value, traceback) of an exception raised while snapshotting
        # or validating; stays None when no exception occurred.
        self.exc_info = None

    def run(self):
        """Run the background job.

        Repeatedly snapshots and validates through the hook until ``kill`` is
        called, a validation fails, or the hook raises. The lock is held for a
        whole iteration, so ``kill`` returns only between iterations.
        """
        while True:
            with self._lock:
                if not self._is_alive:
                    break
                try:
                    self._hook.pause_and_copy(self.backup_num)
                    snapshot_ok = self._hook.validate_all(self.backup_num)
                except Exception as err:  # pylint: disable=broad-except
                    # Record the failure so the hook can report it instead of
                    # the thread dying without setting found_error.
                    self.exc_info = (type(err), err, err.__traceback__)
                    self.found_error = True
                    self._is_alive = False
                    return
                if not snapshot_ok:
                    self.found_error = True
                    self._is_alive = False
                    running_test = self._hook.running_test
                    # running_test is None until the hook's before_test runs.
                    if running_test is not None:
                        running_test.fixture.teardown()
                    return
            time.sleep(random.randint(1, 5))
            self.backup_num += 1

    def kill(self):
        """Kill the background job."""
        with self._lock:
            self._is_alive = False
class SimulateCrash(interface.Hook):
    """A hook to simulate crashes."""

    IS_BACKGROUND = True

    def __init__(self, hook_logger, fixture):
        """Initialize SimulateCrash."""
        interface.Hook.__init__(self, hook_logger, fixture, "Simulate crashes hook")
        self.found_error = False
        # Next port handed to a validation mongod; validate_all wraps it back
        # to 19000 once it reaches 20000.
        self.last_validate_port = 19000
        self.logger = hook_logger
        self.running_test = None
        self._background_job = None

    def pause_and_copy(self, backup_num):
        """Pause each node, copy its data files, then resume it.

        Nodes are visited in random order. Each node is paused through
        ``node.mongod.pause()`` and always resumed through
        ``node.mongod.resume()``, even if the copy fails. Files are copied to
        ``<dbpath>/simulateCrashes/<backup_num>``.
        """
        self.logger.info("Taking snapshot #{}".format(backup_num))
        nodes_to_copy = list(self.fixture.nodes)
        random.shuffle(nodes_to_copy)
        for node in nodes_to_copy:
            node.mongod.pause()
            # Everything after pause() sits inside the try so that no failure
            # can leave the node paused.
            try:
                dbpath = node.get_dbpath_prefix()
                backup_root = dbpath + "/simulateCrashes/{}".format(backup_num)
                self.logger.info("Starting to copy data files. DBPath: {}".format(dbpath))
                for dirpath, _dirnames, filenames in os.walk(dbpath, followlinks=True):
                    if dirpath.endswith(("/diagnostic.data", "/_tmp")):
                        continue
                    # Never copy earlier snapshots into the new snapshot.
                    if "/simulateCrashes" in dirpath:
                        continue
                    for filename in filenames:
                        if "Preplog" in filename:
                            continue
                        fqfn = "/".join([dirpath, filename])
                        self.copy_file(dbpath, fqfn, backup_root)
            finally:
                node.mongod.resume()

    @classmethod
    def copy_file(cls, root, fqfn, new_root):
        """Copy ``fqfn`` to the same relative location under ``new_root``.

        Args:
            root: directory prefix of ``fqfn`` that is stripped to obtain the
                relative path.
            fqfn: full path of the file to copy.
            new_root: directory that receives the copy.
        """
        rel = fqfn[len(root):]
        dest = new_root + rel
        os.makedirs(new_root + "/journal", exist_ok=True)
        # Files may live in subdirectories other than "journal".
        os.makedirs(os.path.dirname(dest), exist_ok=True)
        in_fd = os.open(fqfn, os.O_RDONLY)
        try:
            in_bytes = os.fstat(in_fd).st_size
            out_fd = os.open(dest, os.O_WRONLY | os.O_CREAT)
            try:
                # sendfile may transfer fewer bytes than requested, so loop
                # until the whole file has been copied.
                offset = 0
                while offset < in_bytes:
                    sent = os.sendfile(out_fd, in_fd, offset, in_bytes - offset)
                    if sent == 0:
                        # Source shrank underneath us; nothing more to read.
                        break
                    offset += sent
            finally:
                os.close(out_fd)
        finally:
            os.close(in_fd)

    def validate_all(self, backup_num):
        """Start a standalone node to validate all collections on the copied data files.

        Returns:
            False as soon as one node's snapshot fails validation (its files
            are left in place); True once every snapshot validated and was
            removed.
        """
        for node in self.fixture.nodes:
            if self.last_validate_port >= 20000:
                self.last_validate_port = 19000
            validate_port = self.last_validate_port
            self.last_validate_port += 1

            dbpath = node.get_dbpath_prefix()
            path = dbpath + "/simulateCrashes/{}".format(backup_num)
            self.logger.info("Starting to validate. DBPath: {} Port: {}".format(
                path, validate_port))

            mdb = process.Process(self.logger, [
                node.mongod_executable, "--dbpath", path, "--port",
                str(validate_port), "--logpath",
                dbpath + "/simulateCrashes/validate.log"
            ])
            mdb.start()
            # Always stop the standalone and close the client, even when
            # validation raises, so no process or connection is leaked.
            try:
                client = pymongo.MongoClient("localhost:{}".format(validate_port))
                try:
                    is_valid = validate(client, self.logger)
                finally:
                    client.close()
            finally:
                mdb.stop()
                mdb.wait()

            if not is_valid:
                return False

            shutil.rmtree(path, ignore_errors=True)
        return True

    def _report_background_error(self):
        """Log the error recorded by the background job."""
        # The job may not define exc_info; fall back to None rather than
        # raising AttributeError while reporting.
        self.logger.error("Encountered an error inside the simulate crashes hook.",
                          exc_info=getattr(self._background_job, "exc_info", None))

    def before_suite(self, test_report):
        """Start the background thread."""
        self.logger.info("Starting the SimulateCrashes thread.")
        self._background_job = BGJob(self)
        self._background_job.start()

    def after_suite(self, test_report, teardown_flag=None):
        """Signal the background thread to exit, and wait until it does."""
        if self._background_job is None:
            return

        self.logger.info("Stopping the SimulateCrashes thread.")
        self._background_job.kill()
        self._background_job.join()

        if self._background_job.found_error:
            self._report_background_error()

    def before_test(self, test, test_report):
        """Each test will call this before it executes."""
        self.running_test = test

    def after_test(self, test, test_report):
        """Each test will call this after it executes. Check if the hook found an error."""
        if self._background_job is None:
            return

        if not self._background_job.found_error:
            return

        self._background_job.kill()
        self._background_job.join()
        self._report_background_error()