# Source file: mongo/buildscripts/resmokelib/testing/fixtures/shardedcluster.py
# Snapshot taken: 2022-05-01 08:29:11 +00:00
# Size: 590 lines, 27 KiB
# Language: Python

"""Sharded cluster fixture for executing JSTests against."""
import os.path
import time
import yaml
import pymongo
import pymongo.errors
import buildscripts.resmokelib.testing.fixtures.interface as interface
class ShardedClusterFixture(interface.Fixture):  # pylint: disable=too-many-instance-attributes
    """Fixture which provides JSTests with a sharded cluster to run against.

    The fixture does not create its member processes itself. A builder obtains the keyword
    arguments from get_configsvr_kwargs(), get_rs_shard_kwargs() and get_mongos_kwargs(), and
    hands the resulting fixtures back through install_configsvr(), install_rs_shard() and
    install_mongos().
    """

    _CONFIGSVR_REPLSET_NAME = "config-rs"
    _SHARD_REPLSET_NAME_PREFIX = "shard-rs"
    # Upper bound, in seconds, for every shard mongod to answer getShardVersion successfully.
    AWAIT_SHARDING_INITIALIZATION_TIMEOUT_SECS = 60

    def __init__(  # pylint: disable=too-many-arguments,too-many-locals
            self, logger, job_num, fixturelib, mongos_executable=None, mongos_options=None,
            mongod_executable=None, mongod_options=None, dbpath_prefix=None, preserve_dbpath=False,
            num_shards=1, num_rs_nodes_per_shard=1, num_mongos=1, enable_sharding=None,
            enable_balancer=True, enable_autosplit=True, auth_options=None, configsvr_options=None,
            shard_options=None, cluster_logging_prefix=None):
        """Initialize ShardedClusterFixture with different options for the cluster processes.

        Raises:
            ValueError: if mongod_options contains "dbpath", or if num_rs_nodes_per_shard is a
                non-positive integer.
            TypeError: if num_rs_nodes_per_shard is None.
        """
        interface.Fixture.__init__(self, logger, job_num, fixturelib, dbpath_prefix=dbpath_prefix)

        # Resolve the None default before the first use; the checks below would otherwise
        # dereference None when the caller relies on the default.
        mongod_options = self.fixturelib.default_if_none(mongod_options, {})
        if "dbpath" in mongod_options:
            raise ValueError("Cannot specify mongod_options.dbpath")

        self.mongos_executable = mongos_executable
        self.mongos_options = self.fixturelib.make_historic(
            self.fixturelib.default_if_none(mongos_options, {}))
        self.mongod_options = self.fixturelib.make_historic(mongod_options)
        self.mongod_executable = mongod_executable
        self.mongod_options["set_parameters"] = self.fixturelib.make_historic(
            mongod_options.get("set_parameters", {})).copy()
        # Only supply a default; a value chosen by the caller is kept.
        self.mongod_options["set_parameters"]["migrationLockAcquisitionMaxWaitMS"] = \
            self.mongod_options["set_parameters"].get("migrationLockAcquisitionMaxWaitMS", 30000)

        self.preserve_dbpath = preserve_dbpath
        self.num_shards = num_shards
        self.num_rs_nodes_per_shard = num_rs_nodes_per_shard
        self.num_mongos = num_mongos
        self.enable_sharding = self.fixturelib.default_if_none(enable_sharding, [])
        self.enable_balancer = enable_balancer
        self.enable_autosplit = enable_autosplit
        self.auth_options = auth_options
        self.configsvr_options = self.fixturelib.make_historic(
            self.fixturelib.default_if_none(configsvr_options, {}))
        self.shard_options = self.fixturelib.make_historic(
            self.fixturelib.default_if_none(shard_options, {}))

        # The logging prefix used in cluster to cluster replication.
        if cluster_logging_prefix is None:
            self.cluster_logging_prefix = ""
        else:
            self.cluster_logging_prefix = f"{cluster_logging_prefix}:"
        self.configsvr_shard_logging_prefix = f"{self.cluster_logging_prefix}configsvr"
        self.rs_shard_logging_prefix = f"{self.cluster_logging_prefix}shard"
        self.mongos_logging_prefix = f"{self.cluster_logging_prefix}mongos"

        # Values that are neither None nor int are deliberately left unchecked here.
        if self.num_rs_nodes_per_shard is None:
            raise TypeError("num_rs_nodes_per_shard must be an integer but found None")
        if isinstance(self.num_rs_nodes_per_shard, int) and self.num_rs_nodes_per_shard <= 0:
            raise ValueError("num_rs_nodes_per_shard must be a positive integer")

        self._dbpath_prefix = os.path.join(self._dbpath_prefix, self.config.FIXTURE_SUBDIR)

        # Member fixtures; populated later through the install_*() methods.
        self.configsvr = None
        self.mongos = []
        self.shards = []

    def pids(self):
        """:return: pids owned by this fixture if any."""
        owned = []
        if self.configsvr is None:
            self.logger.debug(
                'Config server not running when gathering sharded cluster fixture pids.')
        else:
            owned.extend(self.configsvr.pids())

        # self.shards is always a list, so test for emptiness rather than for None.
        if self.shards:
            for shard in self.shards:
                owned.extend(shard.pids())
        else:
            self.logger.debug('No shards when gathering sharded cluster fixture pids.')

        # The routers are torn down by this fixture as well, so their pids belong to it too.
        for mongos in self.mongos:
            owned.extend(mongos.pids())
        return owned

    def setup(self):
        """Set up the sharded cluster.

        Only the config server and the shards are started here; the mongos routers are
        started from await_ready().
        """
        self.configsvr.setup()
        # Start up each of the shards
        for shard in self.shards:
            shard.setup()

    def refresh_logical_session_cache(self, target):
        """Refresh logical session cache with no timeout."""
        primary_client = target.get_primary().mongo_client()
        try:
            primary_client.admin.command({"refreshLogicalSessionCacheNow": 1})
        except pymongo.errors.OperationFailure as err:
            # NOTE(review): _WRITE_CONCERN_FAILED is not defined in this class; presumably it is
            # provided by interface.Fixture -- confirm.
            if err.code != self._WRITE_CONCERN_FAILED:
                raise
            self.logger.info("Ignoring write concern timeout for refreshLogicalSessionCacheNow "
                             "command and continuing to wait")
            target.await_last_op_committed(target.AWAIT_REPL_TIMEOUT_FOREVER_MINS * 60)

    def get_shard_ids(self):
        """Get the list of shard ids in the cluster."""
        client = self.mongo_client()
        interface.authenticate(client, self.auth_options)
        shard_list = client.admin.command("listShards")["shards"]
        return [shard_info["_id"] for shard_info in shard_list]

    def await_ready(self):
        """Block until the fixture can be used for testing."""
        # Wait for the config server
        if self.configsvr is not None:
            self.configsvr.await_ready()

        # Wait for each of the shards
        for shard in self.shards:
            shard.await_ready()

        # We call mongos.setup() in self.await_ready() function instead of self.setup()
        # because mongos routers have to connect to a running cluster.
        for mongos in self.mongos:
            # Start up the mongos.
            mongos.setup()
            # Wait for the mongos.
            mongos.await_ready()

        client = self.mongo_client()
        interface.authenticate(client, self.auth_options)

        # Turn off the balancer if it is not meant to be enabled.
        if not self.enable_balancer:
            self.stop_balancer()

        # Turn off autosplit if it is not meant to be enabled.
        if not self.enable_autosplit:
            majority_wc = pymongo.WriteConcern(w="majority", wtimeout=30000)
            settings = client.config.get_collection("settings", write_concern=majority_wc)
            settings.update_one({"_id": "autosplit"}, {"$set": {"enabled": False}}, upsert=True)

        # Inform mongos about each of the shards
        for shard in self.shards:
            self._add_shard(client, shard)

        # Ensure that all CSRS nodes are up to date. This is strictly needed for tests that use
        # multiple mongoses. In those cases, the first mongos initializes the contents of the config
        # database, but without waiting for those writes to replicate to all the config servers then
        # the secondary mongoses risk reading from a stale config server and seeing an empty config
        # database.
        self.configsvr.await_last_op_committed()

        # Enable sharding on each of the specified databases
        for db_name in self.enable_sharding:
            self.logger.info("Enabling sharding for '%s' database...", db_name)
            client.admin.command({"enablesharding": db_name})

        # Wait for mongod's to be ready.
        self._await_mongod_sharding_initialization()

        # Ensure that the sessions collection gets auto-sharded by the config server
        if self.configsvr is not None:
            self.refresh_logical_session_cache(self.configsvr)

        for shard in self.shards:
            self.refresh_logical_session_cache(shard)

    def _await_mongod_sharding_initialization(self):
        """Poll every shard mongod until getShardVersion succeeds.

        Nothing is done unless at least one database has sharding enabled. A single deadline of
        AWAIT_SHARDING_INITIALIZATION_TIMEOUT_SECS is shared by all nodes.

        Raises:
            ServerFailure (from fixturelib): if a node does not succeed before the deadline.
        """
        if not (self.enable_sharding and self.num_rs_nodes_per_shard is not None):
            return

        timeout_secs = ShardedClusterFixture.AWAIT_SHARDING_INITIALIZATION_TIMEOUT_SECS
        deadline = time.time() + timeout_secs

        mongod_clients = [(mongod.mongo_client(), mongod.port) for shard in self.shards
                          for mongod in shard.nodes]

        for client, port in mongod_clients:
            interface.authenticate(client, self.auth_options)

            while True:
                # The choice of namespace (local.fooCollection) does not affect the output.
                get_shard_version_result = client.admin.command(
                    "getShardVersion", "local.fooCollection", check=False)
                if get_shard_version_result["ok"]:
                    break

                if deadline - time.time() <= 0.0:
                    # Report the timeout that was actually applied to this wait.
                    raise self.fixturelib.ServerFailure(
                        "mongod on port: {} failed waiting for getShardVersion success after {} seconds"
                        .format(port, timeout_secs))
                time.sleep(0.1)

    def stop_balancer(self, timeout_ms=60000):
        """Stop the balancer."""
        client = self.mongo_client()
        interface.authenticate(client, self.auth_options)
        client.admin.command({"balancerStop": 1}, maxTimeMS=timeout_ms)
        self.logger.info("Stopped the balancer")

    def start_balancer(self, timeout_ms=60000):
        """Start the balancer."""
        client = self.mongo_client()
        interface.authenticate(client, self.auth_options)
        client.admin.command({"balancerStart": 1}, maxTimeMS=timeout_ms)
        self.logger.info("Started the balancer")

    def _do_teardown(self, mode=None):
        """Shut down the sharded cluster.

        Members are stopped in the order mongos, shards, config server.

        Raises:
            ServerFailure (from fixturelib): if any member failed to stop cleanly.
        """
        self.logger.info("Stopping all members of the sharded cluster...")

        if not self.is_running():
            self.logger.warning("All members of the sharded cluster were expected to be running, "
                                "but weren't.")

        # If we're killing or aborting to archive data files, stopping the balancer will execute
        # server commands that might lead to on-disk changes from the point of failure.
        forced_modes = (interface.TeardownMode.KILL, interface.TeardownMode.ABORT)
        if self.enable_balancer and mode not in forced_modes:
            self.stop_balancer()

        teardown_handler = interface.FixtureTeardownHandler(self.logger)

        for mongos in self.mongos:
            teardown_handler.teardown(mongos, "mongos", mode=mode)

        for shard in self.shards:
            teardown_handler.teardown(shard, "shard", mode=mode)

        if self.configsvr is not None:
            teardown_handler.teardown(self.configsvr, "config server", mode=mode)

        if not teardown_handler.was_successful():
            self.logger.error("Stopping the sharded cluster fixture failed.")
            raise self.fixturelib.ServerFailure(teardown_handler.get_error_message())
        self.logger.info("Successfully stopped all members of the sharded cluster.")

    def is_running(self):
        """Return true if all nodes in the cluster are all still operating."""
        return (self.configsvr is not None and self.configsvr.is_running()
                and all(shard.is_running() for shard in self.shards)
                and all(mongos.is_running() for mongos in self.mongos))

    def get_internal_connection_string(self):
        """Return the internal connection string.

        Raises:
            ValueError: if no mongos has been installed yet.
        """
        # self.mongos starts out as an empty list, so an "is None" test could never fire.
        if not self.mongos:
            raise ValueError("Must call setup() before calling get_internal_connection_string()")

        return ",".join(mongos.get_internal_connection_string() for mongos in self.mongos)

    def get_driver_connection_url(self):
        """Return the driver connection URL."""
        return "mongodb://" + self.get_internal_connection_string()

    def get_node_info(self):
        """Return a list of dicts of NodeInfo objects."""
        # Order: shards, then mongos routers, then the config server.
        output = []
        for shard in self.shards:
            output += shard.get_node_info()
        for mongos in self.mongos:
            output += mongos.get_node_info()
        return output + self.configsvr.get_node_info()

    def get_configsvr_logger(self):
        """Return a new logging.Logger instance used for a config server shard."""
        return self.fixturelib.new_fixture_node_logger(self.__class__.__name__, self.job_num,
                                                       self.configsvr_shard_logging_prefix)

    def get_configsvr_kwargs(self):
        """Return args to create replicaset.ReplicaSetFixture configured as the config server."""
        configsvr_options = self.configsvr_options.copy()

        # Options given specifically for the config server win over the cluster-wide values.
        auth_options = configsvr_options.pop("auth_options", self.auth_options)
        preserve_dbpath = configsvr_options.pop("preserve_dbpath", self.preserve_dbpath)
        num_nodes = configsvr_options.pop("num_nodes", 1)

        replset_config_options = configsvr_options.pop("replset_config_options", {})
        replset_config_options["configsvr"] = True

        mongod_options = self.fixturelib.merge_mongo_option_dicts(
            self.mongod_options.copy(),
            self.fixturelib.make_historic(configsvr_options.pop("mongod_options", {})))
        mongod_options["configsvr"] = ""
        mongod_options["dbpath"] = os.path.join(self._dbpath_prefix, "config")
        mongod_options["replSet"] = ShardedClusterFixture._CONFIGSVR_REPLSET_NAME
        mongod_options["storageEngine"] = "wiredTiger"

        # Whatever is left in configsvr_options is forwarded untouched.
        return {
            "mongod_options": mongod_options, "mongod_executable": self.mongod_executable,
            "preserve_dbpath": preserve_dbpath, "num_nodes": num_nodes,
            "auth_options": auth_options, "replset_config_options": replset_config_options,
            "shard_logging_prefix": self.configsvr_shard_logging_prefix, **configsvr_options
        }

    def install_configsvr(self, configsvr):
        """Install a configsvr. Called by a builder."""
        self.configsvr = configsvr

    def _get_rs_shard_logging_prefix(self, index):
        """Return replica set shard logging prefix."""
        return f"{self.rs_shard_logging_prefix}{index}"

    def get_rs_shard_logger(self, index):
        """Return a new logging.Logger instance used for a replica set shard."""
        shard_logging_prefix = self._get_rs_shard_logging_prefix(index)
        return self.fixturelib.new_fixture_node_logger(self.__class__.__name__, self.job_num,
                                                       shard_logging_prefix)

    def get_rs_shard_kwargs(self, index):
        """Return args to create replicaset.ReplicaSetFixture configured as a shard in a sharded cluster."""
        shard_options = self.shard_options.copy()

        # Options given specifically for the shards win over the cluster-wide values.
        auth_options = shard_options.pop("auth_options", self.auth_options)
        preserve_dbpath = shard_options.pop("preserve_dbpath", self.preserve_dbpath)

        replset_config_options = self.fixturelib.make_historic(
            shard_options.pop("replset_config_options", {}))
        replset_config_options["configsvr"] = False

        mongod_options = self.fixturelib.merge_mongo_option_dicts(
            self.mongod_options.copy(),
            self.fixturelib.make_historic(shard_options.pop("mongod_options", {})))
        mongod_options["shardsvr"] = ""
        mongod_options["dbpath"] = os.path.join(self._dbpath_prefix, "shard{}".format(index))
        mongod_options["replSet"] = self._SHARD_REPLSET_NAME_PREFIX + str(index)

        shard_logging_prefix = self._get_rs_shard_logging_prefix(index)

        # Whatever is left in shard_options is forwarded untouched.
        return {
            "mongod_options": mongod_options, "mongod_executable": self.mongod_executable,
            "auth_options": auth_options, "preserve_dbpath": preserve_dbpath,
            "replset_config_options": replset_config_options,
            "shard_logging_prefix": shard_logging_prefix, **shard_options
        }

    def install_rs_shard(self, rs_shard):
        """Install a replica set shard. Called by a builder."""
        self.shards.append(rs_shard)

    def get_mongos_logger(self, index, total):
        """Return a new logging.Logger instance used for a mongos."""
        # A lone mongos keeps the bare prefix; with several, each one gets its index appended.
        if total == 1:
            logger_name = self.mongos_logging_prefix
        else:
            logger_name = f"{self.mongos_logging_prefix}{index}"
        return self.fixturelib.new_fixture_node_logger(self.__class__.__name__, self.job_num,
                                                       logger_name)

    def get_mongos_kwargs(self):
        """Return options that may be passed to a mongos."""
        mongos_options = self.mongos_options.copy()
        mongos_options["configdb"] = self.configsvr.get_internal_connection_string()
        # Copy set_parameters so that the returned options do not share it with this fixture.
        mongos_options["set_parameters"] = mongos_options.get(
            "set_parameters", self.fixturelib.make_historic({})).copy()
        return {"dbpath_prefix": self._dbpath_prefix, "mongos_options": mongos_options}

    def install_mongos(self, mongos):
        """Install a mongos. Called by a builder."""
        self.mongos.append(mongos)

    def _add_shard(self, client, shard):
        """
        Add the specified program as a shard by executing the addShard command.

        See https://docs.mongodb.org/manual/reference/command/addShard for more details.
        """
        connection_string = shard.get_internal_connection_string()
        self.logger.info("Adding %s as a shard...", connection_string)
        client.admin.command({"addShard": connection_string})
class _MongoSFixture(interface.Fixture):
    """Fixture which provides JSTests with a mongos to connect to."""

    # pylint: disable=too-many-arguments
    def __init__(self, logger, job_num, fixturelib, dbpath_prefix, mongos_executable=None,
                 mongos_options=None, add_feature_flags=False):
        """Initialize _MongoSFixture.

        When add_feature_flags is true, every name in config.ENABLED_FEATURE_FLAGS is set to
        "true" in the mongos "set_parameters" options.
        """
        interface.Fixture.__init__(self, logger, job_num, fixturelib)
        self.fixturelib = fixturelib
        self.config = self.fixturelib.get_config()

        # Default to command line options if the YAML configuration is not passed in.
        self.mongos_executable = self.fixturelib.default_if_none(mongos_executable,
                                                                 self.config.MONGOS_EXECUTABLE)

        self.mongos_options = self.fixturelib.make_historic(
            self.fixturelib.default_if_none(mongos_options, {})).copy()

        if add_feature_flags:
            # The caller may not have supplied any set_parameters (for example when
            # mongos_options is left at its default); create the container first so the
            # assignments below cannot raise KeyError.
            if self.mongos_options.get("set_parameters") is None:
                self.mongos_options["set_parameters"] = self.fixturelib.make_historic({})
            for ff in self.config.ENABLED_FEATURE_FLAGS:
                self.mongos_options["set_parameters"][ff] = "true"

        # The process handle; stays None until setup() has started the mongos.
        self.mongos = None
        self.port = fixturelib.get_next_port(job_num)
        self.mongos_options["port"] = self.port

        self._dbpath_prefix = dbpath_prefix

    def setup(self):
        """Launch the mongos process.

        Raises:
            ServerFailure (from fixturelib): if the process could not be started.
        """
        if self.config.ALWAYS_USE_LOG_FILES:
            self.mongos_options["logpath"] = self._dbpath_prefix + "/{name}.log".format(
                name=self.logger.name)
            self.mongos_options["logappend"] = ""

        launcher = MongosLauncher(self.fixturelib)
        mongos, _ = launcher.launch_mongos_program(self.logger, self.job_num,
                                                   executable=self.mongos_executable,
                                                   mongos_options=self.mongos_options)
        self.mongos_options["port"] = self.port
        try:
            self.logger.info("Starting mongos on port %d...\n%s", self.port, mongos.as_command())
            mongos.start()
            self.logger.info("mongos started on port %d with pid %d.", self.port, mongos.pid)
        except Exception as err:
            msg = "Failed to start mongos on port {:d}: {}".format(self.port, err)
            self.logger.exception(msg)
            # Keep the original failure attached as the explicit cause.
            raise self.fixturelib.ServerFailure(msg) from err

        self.mongos = mongos

    def pids(self):
        """:return: pids owned by this fixture if any."""
        if self.mongos is None:
            self.logger.debug('Mongos not running when gathering mongos fixture pids.')
            return []
        return [self.mongos.pid]

    def await_ready(self):
        """Block until the fixture can be used for testing.

        Raises:
            ServerFailure (from fixturelib): if the mongos process exits, or if it cannot be
                reached within interface.Fixture.AWAIT_READY_TIMEOUT_SECS seconds.
        """
        deadline = time.time() + interface.Fixture.AWAIT_READY_TIMEOUT_SECS

        # Wait until the mongos is accepting connections. The retry logic is necessary to support
        # versions of PyMongo <3.0 that immediately raise a ConnectionFailure if a connection cannot
        # be established.
        while True:
            # Check whether the mongos exited for some reason.
            exit_code = self.mongos.poll()
            if exit_code is not None:
                raise self.fixturelib.ServerFailure(
                    "Could not connect to mongos on port {}, process ended"
                    " unexpectedly with code {}.".format(self.port, exit_code))

            try:
                # Use a shorter connection timeout to more closely satisfy the requested deadline.
                client = self.mongo_client(timeout_millis=500)
                client.admin.command("ping")
                break
            except pymongo.errors.ConnectionFailure:
                if deadline - time.time() <= 0.0:
                    raise self.fixturelib.ServerFailure(
                        "Failed to connect to mongos on port {} after {} seconds".format(
                            self.port, interface.Fixture.AWAIT_READY_TIMEOUT_SECS))

                self.logger.info("Waiting to connect to mongos on port %d.", self.port)
                time.sleep(0.1)  # Wait a little bit before trying again.

        self.logger.info("Successfully contacted the mongos on port %d.", self.port)

    def _do_teardown(self, mode=None):
        """Stop the mongos process and verify its exit code.

        Raises:
            ServerFailure (from fixturelib): if the process was not running, or if it exited
                with a code other than 0 or the negated value of `mode`.
        """
        if self.mongos is None:
            self.logger.warning("The mongos fixture has not been set up yet.")
            return  # Teardown is still a success even if nothing is running.

        if mode == interface.TeardownMode.ABORT:
            self.logger.info(
                "Attempting to send SIGABRT from resmoke to mongos on port %d with pid %d...",
                self.port, self.mongos.pid)
        else:
            self.logger.info("Stopping mongos on port %d with pid %d...", self.port,
                             self.mongos.pid)

        if not self.is_running():
            exit_code = self.mongos.poll()
            msg = ("mongos on port {:d} was expected to be running, but wasn't. "
                   "Process exited with code {:d}").format(self.port, exit_code)
            self.logger.warning(msg)
            raise self.fixturelib.ServerFailure(msg)

        self.mongos.stop(mode=mode)
        exit_code = self.mongos.wait()

        # Python's subprocess module returns negative versions of system calls.
        # pylint: disable=invalid-unary-operand-type
        stopped_cleanly = exit_code == 0 or (mode is not None and exit_code == -(mode.value))
        if not stopped_cleanly:
            self.logger.warning("Stopped the mongos on port {:d}. "
                                "Process exited with code {:d}.".format(self.port, exit_code))
            raise self.fixturelib.ServerFailure(
                "mongos on port {:d} with pid {:d} exited with code {:d}".format(
                    self.port, self.mongos.pid, exit_code))
        self.logger.info("Successfully stopped the mongos on port {:d}".format(self.port))

    def is_running(self):
        """Return true if the mongos process has been started and has not exited."""
        return self.mongos is not None and self.mongos.poll() is None

    def get_internal_connection_string(self):
        """Return the internal connection string."""
        return "localhost:%d" % self.port

    def get_driver_connection_url(self):
        """Return the driver connection URL."""
        return "mongodb://" + self.get_internal_connection_string()

    def get_node_info(self):
        """Return a list of NodeInfo objects."""
        info = interface.NodeInfo(full_name=self.logger.full_name, name=self.logger.name,
                                  port=self.port, pid=self.mongos.pid)
        return [info]
# Quiesce period, in milliseconds, that a mongos is given on shutdown unless the suite sets
# "mongosShutdownTimeoutMillisForSignaledShutdown" itself.
DEFAULT_MONGOS_SHUTDOWN_TIMEOUT_MILLIS = 0

# 'logComponentVerbosity' applied to tests that were not started with an Evergreen task id,
# i.e. any tests run locally.
DEFAULT_MONGOS_LOG_COMPONENT_VERBOSITY = {"transaction": 3}

# 'logComponentVerbosity' applied to tests running in Evergreen, i.e. started with an Evergreen
# task id.
DEFAULT_EVERGREEN_MONGOS_LOG_COMPONENT_VERBOSITY = {"transaction": 3}
class MongosLauncher(object):
    """Class with utilities for launching a mongos."""

    def __init__(self, fixturelib):
        """Initialize MongosLauncher."""
        self.fixturelib = fixturelib
        self.config = fixturelib.get_config()

    def default_mongos_log_component_verbosity(self):
        """Return the default 'logComponentVerbosity' value to use for mongos processes."""
        if self.config.EVERGREEN_TASK_ID:
            return DEFAULT_EVERGREEN_MONGOS_LOG_COMPONENT_VERBOSITY
        return DEFAULT_MONGOS_LOG_COMPONENT_VERBOSITY

    def launch_mongos_program(self, logger, job_num, executable=None, process_kwargs=None,
                              mongos_options=None):
        """Return the result of fixturelib.mongos_program() for a mongos built from 'mongos_options'.

        A 'mongos_options' dict passed by the caller is updated in place: its "set_parameters"
        entry is created if missing and filled with the resmoke defaults below.
        """
        executable = self.fixturelib.default_if_none(executable,
                                                     self.config.DEFAULT_MONGOS_EXECUTABLE)

        # The parameter defaults to None; start from an empty option set in that case instead
        # of failing on None.setdefault().
        if mongos_options is None:
            mongos_options = {}

        # Apply the --setParameter command line argument. Command line options to resmoke.py override
        # the YAML configuration.
        suite_set_parameters = mongos_options.setdefault("set_parameters", {})

        if self.config.MONGOS_SET_PARAMETERS is not None:
            # safe_load() yields None for an empty document; treat that as "no overrides".
            suite_set_parameters.update(yaml.safe_load(self.config.MONGOS_SET_PARAMETERS) or {})

        # Set default log verbosity levels if none were specified.
        if "logComponentVerbosity" not in suite_set_parameters:
            suite_set_parameters[
                "logComponentVerbosity"] = self.default_mongos_log_component_verbosity()

        # Set default shutdown timeout millis if none was specified.
        if "mongosShutdownTimeoutMillisForSignaledShutdown" not in suite_set_parameters:
            suite_set_parameters[
                "mongosShutdownTimeoutMillisForSignaledShutdown"] = DEFAULT_MONGOS_SHUTDOWN_TIMEOUT_MILLIS

        _add_testing_set_parameters(suite_set_parameters)

        return self.fixturelib.mongos_program(logger, job_num, executable, process_kwargs,
                                              mongos_options)
def _add_testing_set_parameters(suite_set_parameters):
"""
Add certain behaviors should only be enabled for resmoke usage.
These are traditionally enable new commands, insecure access, and increased diagnostics.
"""
suite_set_parameters.setdefault("testingDiagnosticsEnabled", True)
suite_set_parameters.setdefault("enableTestCommands", True)