# mongo/buildscripts/resmokelib/testing/fixtures/shard_split.py

"""Fixture for testing shard split operations."""

import time
import os.path
import threading
import shutil

import pymongo
from bson.objectid import ObjectId

import buildscripts.resmokelib.testing.fixtures.interface as interface
from buildscripts.resmokelib.testing.fixtures.fixturelib import with_naive_retry


def _is_replica_set_fixture(fixture):
    """Return True when `fixture` exposes a `replset_name` attribute.

    Fixtures are classified by the presence of that attribute rather than by type, so any object
    carrying `replset_name` (whatever its value) is treated as a ReplicaSetFixture.
    """
    # A private sentinel distinguishes "attribute missing" from "attribute set to None".
    missing = object()
    return getattr(fixture, 'replset_name', missing) is not missing


def _teardown_and_clean_fixture(fixture):
    """Tear down `fixture` and then delete the directory returned by its get_dbpath_prefix().

    Errors raised while deleting the directory are propagated to the caller.
    """
    # ReplicaSetFixtures in the shard split fixture share the same logger as the parent
    # ShardSplitFixture instance, so their teardown must not be marked as finished. Only single
    # recipient nodes (before they become a replica set) are torn down with finished=True.
    if _is_replica_set_fixture(fixture):
        fixture.teardown(finished=False)
    else:
        fixture.teardown(finished=True)

    # Remove the data directory for the node to prevent unbounded disk space utilization.
    data_dir = fixture.get_dbpath_prefix()
    shutil.rmtree(data_dir, ignore_errors=False)


class ShardSplitFixture(interface.MultiClusterFixture):
    """Fixture which provides JSTests with a replica set and recipient nodes to run splits against.

    The first entry of `self.fixtures` is the donor replica set; any further entries are the
    recipient nodes of the split currently in progress. Methods that read or replace that list
    take a private lock, except setup(), await_ready() and _do_teardown(), which deliberately
    skip it (see the comments in those methods).
    """

    AWAIT_REPL_TIMEOUT_MINS = 5
    AWAIT_REPL_TIMEOUT_FOREVER_MINS = 24 * 60

    # Error response code tolerated while waiting for recipient nodes to become secondaries.
    # NOTE(review): 355 is 'InterruptedDueToStorageChange' in mongo/base/error_codes.yml --
    # confirm the value against the server sources.
    _INTERRUPTED_DUE_TO_STORAGE_CHANGE = 355

    def __init__(
            self,
            logger,
            job_num,
            fixturelib,
            common_mongod_options=None,
            per_mongod_options=None,
            dbpath_prefix=None,
            preserve_dbpath=False,
            num_nodes_per_replica_set=2,
            auth_options=None,
            replset_config_options=None,
            mixed_bin_versions=None,
    ):
        """Initialize ShardSplitFixture with different options for the replica set processes.

        :param logger: logger forwarded to the base fixture and to the donor replica set.
        :param job_num: job number forwarded to the base fixture and to every child fixture.
        :param fixturelib: helper library used to build child fixtures, loggers and ports.
        :param common_mongod_options: options applied to every mongod. Note that this dict is
            modified in place: `set_parameters.shardSplitTimeoutMS` is set to 60000 when absent.
        :param per_mongod_options: stored on the instance; defaults to an empty dict.
        :param dbpath_prefix: forwarded to the base fixture and to the recipient node fixtures.
        :param preserve_dbpath: forwarded to every child fixture.
        :param num_nodes_per_replica_set: number of nodes in the donor set and number of recipient
            nodes per split; a falsy value falls back to `self.config.NUM_REPLSET_NODES`.
        :param auth_options: mapping with the keys 'username', 'password',
            'authenticationDatabase' and 'authenticationMechanism', used to authenticate the
            clients this fixture creates. When None, clients are created without credentials.
        :param replset_config_options: forwarded to the replica set fixtures.
        :param mixed_bin_versions: forwarded to the replica set fixtures; defaults to
            `self.config.MIXED_BIN_VERSIONS`.
        """
        interface.MultiClusterFixture.__init__(self, logger, job_num, fixturelib,
                                               dbpath_prefix=dbpath_prefix)
        self.__lock = threading.Lock()

        self.common_mongod_options = self.fixturelib.default_if_none(common_mongod_options, {})
        self.per_mongod_options = self.fixturelib.default_if_none(per_mongod_options, {})
        self.dbpath_prefix = dbpath_prefix
        self.preserve_dbpath = preserve_dbpath
        self.auth_options = auth_options
        self.replset_config_options = self.fixturelib.default_if_none(replset_config_options, {})
        self.mixed_bin_versions = self.fixturelib.default_if_none(mixed_bin_versions,
                                                                  self.config.MIXED_BIN_VERSIONS)
        self.num_nodes_per_replica_set = (num_nodes_per_replica_set
                                          or self.config.NUM_REPLSET_NODES)
        # The default shard split timeout (10 seconds) is not long enough for some test cases:
        # on slow systems the shard split operation can take longer than that. Raise it to 60
        # seconds unless the caller already chose a value.
        if "set_parameters" not in self.common_mongod_options:
            self.common_mongod_options["set_parameters"] = {}
        if "shardSplitTimeoutMS" not in self.common_mongod_options["set_parameters"]:
            self.common_mongod_options["set_parameters"]["shardSplitTimeoutMS"] = 60000

        self.fixtures = []
        self._can_teardown_retired_donor_rs = threading.Event()
        # By default, we can always tear down the retired donor rs
        self._can_teardown_retired_donor_rs.set()

        # Make the initial donor replica set
        donor_rs_name = "rs0"
        mongod_options = self.common_mongod_options.copy()
        mongod_options["dbpath"] = os.path.join(self._dbpath_prefix, donor_rs_name)
        mongod_options["serverless"] = True

        self.fixtures.append(
            self.fixturelib.make_fixture(
                "ReplicaSetFixture", self.logger, self.job_num, mongod_options=mongod_options,
                preserve_dbpath=self.preserve_dbpath, num_nodes=self.num_nodes_per_replica_set,
                auth_options=self.auth_options, replset_config_options=self.replset_config_options,
                mixed_bin_versions=self.mixed_bin_versions, replicaset_logging_prefix=donor_rs_name,
                all_nodes_electable=True, replset_name=donor_rs_name))

        # Ensure that all nodes are only ever run on the same deterministic set of ports, this
        # makes it easier to reroute in the jstest overrides. Port set 0 is the initial donor's
        # ports, port set 1 is freshly allocated; `_port_index` selects the set used by the next
        # batch of recipient nodes.
        self._port_index = 0
        self._ports = [[node.port for node in self.get_donor_rs().nodes],
                       [
                           self.fixturelib.get_next_port(self.job_num)
                           for _ in range(self.num_nodes_per_replica_set)
                       ]]

    def pids(self):
        """:return: pids owned by this fixture if any."""
        out = []
        with self.__lock:
            for fixture in self.fixtures:
                out.extend(fixture.pids())
        if not out:
            self.logger.debug('No fixtures when gathering pids.')
        return out

    def setup(self):
        """Set up the replica sets."""
        # Don't take the lock because we don't expect setup to be called while the
        # ContinuousShardSplit hook is running, which is the only thing that can modify
        # self.fixtures. We don't want to take the lock because it would be held while starting
        # mongod instances, which is prone to hanging and could cause other functions which take
        # the lock to hang.
        for fixture in self.fixtures:
            fixture.setup()

    def await_ready(self):
        """Block until the fixture can be used for testing."""
        # Don't take the lock because we don't expect await_ready to be called while the
        # ContinuousShardSplit hook is running, which is the only thing that can modify
        # self.fixtures. We don't want to take the lock because it would be held while waiting for
        # the donor to initiate which may take a long time.
        for fixture in self.fixtures:
            fixture.await_ready()

    def _do_teardown(self, mode=None):
        """Shut down the replica sets.

        Every fixture is torn down, in reverse order, even if an earlier one fails.

        :param mode: teardown mode forwarded to each fixture's teardown handler.
        :raises ServerFailure: if tearing down any of the fixtures failed.
        """
        self.logger.info("Stopping all replica sets...")

        running_at_start = self.is_running()
        if not running_at_start:
            self.logger.warning("Donor replica set expected to be running, but wasn't.")

        teardown_handler = interface.FixtureTeardownHandler(self.logger)

        # Don't take the lock because we don't expect teardown to be called while the
        # ContinuousShardSplit hook is running, which is the only thing that can modify
        # self.fixtures. Tearing down may take a long time, so taking the lock during that process
        # might result in hangs in other functions which need to take the lock.
        for fixture in reversed(self.fixtures):
            type_name = f"replica set '{fixture.replset_name}'" if _is_replica_set_fixture(
                fixture) else f"standalone on port {fixture.port}"
            teardown_handler.teardown(fixture, type_name, mode=mode)

        if teardown_handler.was_successful():
            self.logger.info("Successfully stopped donor replica set and all recipient nodes.")
        else:
            self.logger.error("Stopping the fixture failed.")
            raise self.fixturelib.ServerFailure(teardown_handler.get_error_message())

    def is_running(self):
        """Return true if the donor replica set is still operating."""
        # This method is most importantly used in between test runs in job.py to determine if a
        # fixture has crashed between test invocations. We return the `is_running` status of the
        # donor here, instead of all fixtures, some of which may not have been started yet.
        return self.get_donor_rs().is_running()

    def get_internal_connection_string(self):
        """Return the internal connection string to the replica set that currently starts out owning the data."""
        donor_rs = self.get_donor_rs()
        if not donor_rs:
            raise ValueError("Must call setup() before calling get_internal_connection_string()")
        return donor_rs.get_internal_connection_string()

    def get_driver_connection_url(self):
        """Return the driver connection URL to the replica set that currently starts out owning the data."""
        donor_rs = self.get_donor_rs()
        if not donor_rs:
            raise ValueError("Must call setup() before calling get_driver_connection_url")
        return donor_rs.get_driver_connection_url()

    def get_node_info(self):
        """Return the concatenated get_node_info() results of every fixture currently tracked."""
        output = []
        with self.__lock:
            for fixture in self.fixtures:
                output += fixture.get_node_info()
        return output

    def get_independent_clusters(self):
        """Return a copy of the list of fixtures involved in the shard split."""
        with self.__lock:
            return self.fixtures.copy()

    def _get_donor_rs_locked(self):
        """Return the first tracked fixture (the donor), or None; the caller must hold the lock.

        :raises ValueError: if the first fixture is not a ReplicaSetFixture.
        """
        donor_rs = next(iter(self.fixtures), None)
        if donor_rs and not _is_replica_set_fixture(donor_rs):
            raise ValueError("Invalid configuration, donor_rs is not a ReplicaSetFixture")
        return donor_rs

    def get_donor_rs(self):
        """:return the donor replica set."""
        with self.__lock:
            return self._get_donor_rs_locked()

    def get_recipient_nodes(self):
        """:return the recipient nodes for the current split operation."""
        with self.__lock:
            return self.fixtures[1:]

    def _create_client(self, fixture, **kwargs):
        """Return `fixture.mongo_client(...)` configured with this fixture's credentials.

        :param fixture: the fixture to connect to.
        :param kwargs: extra keyword arguments forwarded to `fixture.mongo_client`.
        """
        auth_kwargs = {}
        # `auth_options` defaults to None in the constructor; only pass credentials when they
        # were actually configured instead of failing on the subscript.
        if self.auth_options is not None:
            auth_kwargs = {
                "username": self.auth_options["username"],
                "password": self.auth_options["password"],
                "authSource": self.auth_options["authenticationDatabase"],
                "authMechanism": self.auth_options["authenticationMechanism"],
            }
        return fixture.mongo_client(uuidRepresentation='standard', **auth_kwargs, **kwargs)

    def add_recipient_nodes(self, recipient_set_name, recipient_tag_name=None):
        """Build recipient nodes, and reconfig them into the donor as non-voting members.

        :param recipient_set_name: name used for the recipient nodes' loggers and dbpaths.
        :param recipient_tag_name: replica set tag attached to each recipient member; defaults to
            "recipientNode".
        """
        recipient_tag_name = recipient_tag_name or "recipientNode"
        donor_rs_name = self.get_donor_rs().replset_name

        self.logger.info(
            f"Adding {self.num_nodes_per_replica_set} recipient nodes to donor replica set '{donor_rs_name}'."
        )

        with self.__lock:
            self._port_index ^= 1  # Toggle the set of mongod ports between index 0 and 1
            for i in range(self.num_nodes_per_replica_set):
                mongod_logger = self.fixturelib.new_fixture_node_logger(
                    "MongoDFixture", self.job_num, f"{recipient_set_name}:node{i}")

                mongod_options = self.common_mongod_options.copy()
                # Even though these nodes are not starting in a replica set, we structure their
                # files on disk as if they were already part of the new recipient set. This makes
                # logging and cleanup easier.
                mongod_options["dbpath"] = os.path.join(self._dbpath_prefix, recipient_set_name,
                                                        "node{}".format(i))
                # The outer copy above is shallow, so give each node its own set_parameters dict.
                mongod_options["set_parameters"] = mongod_options.get(
                    "set_parameters", self.fixturelib.make_historic({})).copy()
                mongod_options["serverless"] = True
                mongod_port = self._ports[self._port_index][i]

                self.fixtures.append(
                    self.fixturelib.make_fixture(
                        "MongoDFixture", mongod_logger, self.job_num, mongod_options=mongod_options,
                        dbpath_prefix=self.dbpath_prefix, preserve_dbpath=self.preserve_dbpath,
                        port=mongod_port))

        # Start the nodes outside the lock; starting a mongod may take a long time.
        recipient_nodes = self.get_recipient_nodes()
        for recipient_node in recipient_nodes:
            recipient_node.setup()
            recipient_node.await_ready()

        # Reconfig the donor to add the recipient nodes as non-voting members
        donor_client = self._create_client(self.get_donor_rs())

        repl_config = with_naive_retry(lambda: donor_client.admin.command({"replSetGetConfig": 1})[
            "config"])
        repl_members = repl_config["members"]

        for recipient_node in recipient_nodes:
            # It is possible for the reconfig below to fail with a retryable error code like
            # 'InterruptedDueToReplStateChange'. In these cases, we need to run the reconfig
            # again, but some or all of the recipient nodes might have already been added to
            # the member list. Only add recipient nodes which have not yet been added on a
            # retry.
            recipient_host = recipient_node.get_internal_connection_string()
            recipient_entry = {
                "host": recipient_host, "votes": 0, "priority": 0, "hidden": True,
                "tags": {recipient_tag_name: str(ObjectId())}
            }
            member_exists = False
            for index, member in enumerate(repl_members):
                if member["host"] == recipient_host:
                    repl_members[index] = recipient_entry
                    member_exists = True

            if not member_exists:
                repl_members.append(recipient_entry)

        # Re-index all members from 0
        for idx, member in enumerate(repl_members):
            member["_id"] = idx

        # Prepare the new config
        repl_config["version"] = repl_config["version"] + 1
        repl_config["members"] = repl_members

        self.logger.info(
            f"Reconfiguring donor replica set to add non-voting recipient nodes: {repl_config}")
        with_naive_retry(lambda: donor_client.admin.command({
            "replSetReconfig": repl_config, "maxTimeMS": self.AWAIT_REPL_TIMEOUT_MINS * 60 * 1000
        }))

        # Wait for recipient nodes to become secondaries
        self._await_recipient_nodes()

    def _await_recipient_nodes(self, timeout_secs=None):
        """Wait for recipient nodes to become available.

        :param timeout_secs: total time budget shared by all recipient nodes (the clock starts
            once, before the first node is polled); defaults to AWAIT_REPL_TIMEOUT_MINS minutes.
        :raises ServerFailure: if the budget is exhausted before every node reports secondary.
        """
        if timeout_secs is None:
            timeout_secs = self.AWAIT_REPL_TIMEOUT_MINS * 60

        start = time.time()
        recipient_nodes = self.get_recipient_nodes()
        for recipient_node in recipient_nodes:
            recipient_client = self._create_client(recipient_node,
                                                   read_preference=pymongo.ReadPreference.SECONDARY)
            while True:
                now = time.time()
                if (now - start) >= timeout_secs:
                    msg = f"Timed out while waiting for secondary on port {recipient_node.port} to become available."
                    self.logger.error(msg)
                    raise self.fixturelib.ServerFailure(msg)

                self.logger.info(
                    f"Waiting for secondary on port {recipient_node.port} to become available.")
                try:
                    is_secondary = recipient_client.admin.command("isMaster")["secondary"]
                    if is_secondary:
                        break
                except pymongo.errors.OperationFailure as err:
                    # Only the storage-change interruption is retried; anything else is fatal.
                    if err.code != ShardSplitFixture._INTERRUPTED_DUE_TO_STORAGE_CHANGE:
                        raise

                time.sleep(0.1)  # Wait a little bit before trying again.
            self.logger.info(f"Secondary on port {recipient_node.port} is now available.")

    def remove_recipient_nodes(self, recipient_tag_name=None):
        """Remove recipient nodes from the donor.

        The nodes are dropped from the tracked fixtures, removed from the donor's replica set
        config, and finally torn down together with their data directories.

        :param recipient_tag_name: replica set tag identifying recipient members; defaults to
            "recipientNode".
        """
        recipient_tag_name = recipient_tag_name or "recipientNode"
        donor_rs_name = self.get_donor_rs().replset_name
        recipient_nodes = self.get_recipient_nodes()
        with self.__lock:
            # Reset the port-set, so we select the same ports next time.
            self._port_index ^= 1

            # Remove the recipient nodes from the internal fixture list.
            self.fixtures = [self._get_donor_rs_locked()]

        donor_client = self._create_client(self.get_donor_rs())
        repl_config = with_naive_retry(lambda: donor_client.admin.command({"replSetGetConfig": 1})[
            "config"])
        # Keep only the members which do not carry the recipient tag.
        repl_members = [
            member for member in repl_config["members"]
            if 'tags' not in member or recipient_tag_name not in member["tags"]
        ]

        # Re-index all members from 0
        for idx, member in enumerate(repl_members):
            member["_id"] = idx

        # Prepare the new config
        repl_config["version"] = repl_config["version"] + 1
        repl_config["members"] = repl_members

        # It's possible that the recipient config has been removed in a previous remove attempt.
        if "recipientConfig" in repl_config:
            del repl_config["recipientConfig"]

        self.logger.info(
            f"Reconfiguring donor '{donor_rs_name}' to remove recipient nodes: {repl_config}")
        with_naive_retry(lambda: donor_client.admin.command({
            "replSetReconfig": repl_config, "maxTimeMS": self.AWAIT_REPL_TIMEOUT_MINS * 60 * 1000
        }))

        self.logger.info("Tearing down recipient nodes and removing data directories.")
        for recipient_node in reversed(recipient_nodes):
            _teardown_and_clean_fixture(recipient_node)

    def replace_donor_with_recipient(self, recipient_set_name):
        """Replace the current donor with the newly initiated recipient.

        Wraps the current recipient nodes in a new ReplicaSetFixture, makes it the only tracked
        fixture, and then tears down the retired donor. The teardown blocks while a stepdown is
        in progress (between enter_step_down() and exit_step_down()).

        :param recipient_set_name: replica set name of the recipient, which becomes the new donor.
        """
        self.logger.info(
            f"Making new donor replica set '{recipient_set_name}' from existing recipient nodes.")
        mongod_options = self.common_mongod_options.copy()
        mongod_options["dbpath"] = os.path.join(self._dbpath_prefix, recipient_set_name)
        mongod_options["serverless"] = True
        new_donor_rs = self.fixturelib.make_fixture(
            "ReplicaSetFixture", self.logger, self.job_num, mongod_options=mongod_options,
            preserve_dbpath=self.preserve_dbpath, num_nodes=self.num_nodes_per_replica_set,
            auth_options=self.auth_options, replset_config_options=self.replset_config_options,
            mixed_bin_versions=self.mixed_bin_versions,
            replicaset_logging_prefix=recipient_set_name, all_nodes_electable=True,
            replset_name=recipient_set_name, existing_nodes=self.get_recipient_nodes())

        new_donor_rs.get_primary()  # Await an election of a new donor primary

        self.logger.info("Replacing internal fixtures with new donor replica set.")
        retired_donor_rs = self.get_donor_rs()
        with self.__lock:
            self.fixtures = [new_donor_rs]

        # Do not retire the old donor while a stepdown is in progress.
        self._can_teardown_retired_donor_rs.wait()
        self.logger.info(f"Retiring old donor replica set '{retired_donor_rs.replset_name}'.")
        _teardown_and_clean_fixture(retired_donor_rs)

    def enter_step_down(self):
        """Called by the ContinuousStepDown hook to indicate that we are stepping down."""
        self.logger.info("Entering stepdown, preventing donor from being retired.")
        self._can_teardown_retired_donor_rs.clear()

    def exit_step_down(self):
        """Called by the ContinuousStepDown hook to indicate that we are done stepping down."""
        self.logger.info("Exiting stepdown, donor can now be retired.")
        self._can_teardown_retired_donor_rs.set()