Files
mongo/buildscripts/resmokelib/hang_analyzer/hang_analyzer.py
Nick Jefferies a4130ea9ab SERVER-119906: Enable UP006 ruff rule (#48284)
GitOrigin-RevId: 2069fa7bda111a89d4a9a43a56e71f06cc2e9a7c
2026-02-20 18:55:17 +00:00

254 lines
10 KiB
Python
Executable File

"""Hang Analyzer module.
A prototype hang analyzer for Evergreen integration to help investigate test timeouts
1. Script supports taking dumps, and/or dumping a summary of useful information about a process
2. Script will iterate through a list of interesting processes,
and run the tools from step 1. The list of processes can be provided as an option.
3. Java processes will be dumped using jstack, if available.
Supports Linux, MacOS X, and Windows.
"""
import getpass
import logging
import os
import platform
import re
import signal
import sys
import traceback
import distro
import psutil
from buildscripts.resmokelib import config as resmoke_config
from buildscripts.resmokelib.hang_analyzer import dumper, process, process_list
from buildscripts.resmokelib.plugin import Subcommand
class HangAnalyzer(Subcommand):
"""Main class for the hang analyzer subcommand."""
def __init__(self, options: dict, task_id=None, logger=None, **_kwargs):
"""
Configure process lists based on options.
:param options: Options as parsed by parser.py
:param logger: Logger to be used. If not specified, one will be created.
:param kwargs: additional args
"""
self.options = options
self.root_logger = None
self.interesting_processes: list[str] = [
# Remove "python", "java" from the list to avoid hang analyzer multiple invocations
"mongo",
"mongod",
"mongos",
"_test",
"dbtest",
]
self.go_processes: list[str] = []
self.process_ids: list[int] = []
def configure_task_id():
run_tid = resmoke_config.EVERGREEN_TASK_ID
hang_analyzer_tid = task_id
if run_tid and hang_analyzer_tid and run_tid != hang_analyzer_tid:
raise ValueError(
"The Evergreen Task ID (tid) should be either passed in through `resmoke.py run` "
"or through `resmoke.py hang-analyzer` but not both. run tid: %s, hang-analyzer tid: %s"
% (run_tid, hang_analyzer_tid)
)
return run_tid or hang_analyzer_tid
self.task_id = configure_task_id()
self._configure_processes()
self._setup_logging(logger)
def kill_rogue_processes(self):
"""Kill any processes that are currently being analyzed."""
processes = process_list.get_processes(
self.process_ids,
self.interesting_processes,
self.options["process_match"],
self.root_logger,
)
process.teardown_processes(self.root_logger, processes, dump_pids={})
def execute(self):
"""
Execute hang analysis.
1. Get a list of interesting processes
2. Dump useful information or take core dumps
"""
self._log_system_info()
dumpers = dumper.get_dumpers(
self.root_logger,
self.options["debugger_output"],
include_terminating=self.options["kill_processes"],
)
processes = process_list.get_processes(
self.process_ids,
self.interesting_processes,
self.options["process_match"],
self.root_logger,
)
def is_python_process(pname: str):
# "live-record*" and "python*" are Python processes. Sending SIGUSR1 causes resmoke.py
# to dump its stack and run the hang analyzer on its child processes.
# Sending SIGUSR1 causes live-record to save its recording and terminate.
return pname.startswith("python") or pname.startswith("live-record")
# Suspending all processes, except python, to prevent them from getting unstuck when
# the hang analyzer attaches to them.
for pinfo in [pinfo for pinfo in processes if not is_python_process(pinfo.name)]:
for pid in pinfo.pidv:
process.pause_process(self.root_logger, pinfo.name, pid)
# Dump python processes by signalling them. The resmoke.py process will generate
# the report.json, when signalled, so we do this before attaching to other processes.
for pinfo in [pinfo for pinfo in processes if is_python_process(pinfo.name)]:
for pid in pinfo.pidv:
process.signal_python(self.root_logger, pinfo.name, pid)
trapped_exceptions = []
dump_pids = {}
# Dump core files of all processes, except python & java.
if self.options["dump_core"]:
take_core_processes = [
pinfo for pinfo in processes if not re.match("^(java|python)", pinfo.name)
]
for pinfo in take_core_processes:
if self._check_enough_free_space():
try:
dumpers.dbg.dump_info(pinfo, take_dump=True)
except dumper.DumpError as err:
self.root_logger.error(err.message)
dump_pids = {**err.dump_pids, **dump_pids}
except Exception as err:
self.root_logger.info("Error encountered when invoking debugger %s", err)
trapped_exceptions.append(traceback.format_exc())
else:
self.root_logger.info(
"Not enough space for a core dump, skipping %s processes with PIDs %s",
pinfo.name,
str(pinfo.pidv),
)
# Dump info of all processes, except python & java.
for pinfo in [pinfo for pinfo in processes if not re.match("^(java|python)", pinfo.name)]:
try:
dumpers.dbg.dump_info(pinfo, take_dump=False)
except Exception as err:
self.root_logger.info("Error encountered when invoking debugger %s", err)
trapped_exceptions.append(traceback.format_exc())
# Dump java processes using jstack.
for pinfo in [pinfo for pinfo in processes if pinfo.name.startswith("java")]:
for pid in pinfo.pidv:
try:
dumpers.jstack.dump_info(
self.root_logger, self.options["debugger_output"], pinfo.name, pid
)
except Exception as err:
self.root_logger.info("Error encountered when invoking debugger %s", err)
trapped_exceptions.append(traceback.format_exc())
# Signal go processes to ensure they print out stack traces, and die on POSIX OSes.
# On Windows, this will simply kill the process since python emulates SIGABRT as
# TerminateProcess.
# Note: The stacktrace output may be captured elsewhere (i.e. resmoke).
for pinfo in [pinfo for pinfo in processes if pinfo.name in self.go_processes]:
for pid in pinfo.pidv:
self.root_logger.info(
"Sending signal SIGABRT to go process %s with PID %d", pinfo.name, pid
)
process.signal_process(self.root_logger, pid, signal.SIGABRT)
self.root_logger.info("Done analyzing all processes for hangs")
# Kill and abort processes if "-k" was specified.
if self.options["kill_processes"]:
process.teardown_processes(self.root_logger, processes, dump_pids)
else:
# Resuming all suspended processes.
for pinfo in [pinfo for pinfo in processes if not pinfo.name.startswith("python")]:
for pid in pinfo.pidv:
process.resume_process(self.root_logger, pinfo.name, pid)
for exception in trapped_exceptions:
self.root_logger.info(exception)
if trapped_exceptions:
raise RuntimeError(
"Exceptions were thrown while dumping. There may still be some valid dumps."
)
def _configure_processes(self):
if self.options["debugger_output"] is None:
self.options["debugger_output"] = ["stdout"]
# add != "" check to avoid empty process_ids
if self.options["process_ids"] is not None and self.options["process_ids"] != "":
# self.process_ids is an int list of PIDs
self.process_ids = [int(pid) for pid in self.options["process_ids"].split(",")]
if self.options["process_names"] is not None:
self.interesting_processes = self.options["process_names"].split(",")
if self.options["go_process_names"] is not None:
self.go_processes = self.options["go_process_names"].split(",")
self.interesting_processes += self.go_processes
def _setup_logging(self, logger):
if logger is None:
self.root_logger = logging.Logger("hang_analyzer", level=logging.DEBUG)
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(logging.Formatter(fmt="%(message)s"))
self.root_logger.addHandler(handler)
else:
self.root_logger = logger
def _log_system_info(self):
self.root_logger.info("Python Version: %s", sys.version)
self.root_logger.info("OS: %s", platform.platform())
try:
if sys.platform in ["win32", "cygwin"]:
self.root_logger.info("Windows Distribution: %s", platform.win32_ver())
else:
self.root_logger.info(
"Linux Distribution: %s %s (%s)",
distro.name(),
distro.version(),
distro.codename(),
)
except AttributeError:
self.root_logger.warning("Cannot determine Linux distro since Python is too old")
try:
uid = os.getuid()
self.root_logger.info("Current UID: %s", uid)
current_login = getpass.getuser()
self.root_logger.info("Current Login: %s", current_login)
except AttributeError:
self.root_logger.warning(
"Cannot determine Unix Current Login, not supported on Windows"
)
except (KeyError, OSError):
# The error from getpass.getuser() when there is no username for a UID.
self.root_logger.warning("No username set for the current UID.")
def _check_enough_free_space(self):
usage_percent = psutil.disk_usage(".").percent
self.root_logger.info("Current disk usage percent: %s", usage_percent)
return usage_percent < self.options["max_disk_usage_percent"]