SERVER-119173: Collect bazel debug information on timeouts (#47724)

GitOrigin-RevId: 3586fd086ea3888368091d830f2ddd22428fb63f
This commit is contained in:
Sean Lyons
2026-02-06 14:56:18 -05:00
committed by MongoDB Bot
parent 07e34a3950
commit 77da337447
3 changed files with 147 additions and 0 deletions

View File

@@ -609,6 +609,24 @@ task_groups:
permissions: private
visibility: signed
content_type: application/json
- func: "f_expansions_write"
# Run the bazel debug-log collection script with bash. The script changes into
# src/ and, when jstack dumps are present there, writes bazel-debug-logs.zip.
- command: subprocess.exec
display_name: "collect bazel debug logs"
params:
binary: bash
args:
- "src/evergreen/collect_bazel_debug_logs.sh"
# Upload the archive written by the previous command to the mciuploads bucket.
# NOTE(review): optional: true is expected to make a missing local_file a
# non-error, which matters because the script writes no archive when it finds
# no jstack dumps; confirm against the Evergreen s3.put documentation.
- command: s3.put
params:
optional: true
aws_key: ${aws_key}
aws_secret: ${aws_secret}
local_file: src/bazel-debug-logs.zip
# The remote name embeds ${execution}; presumably this keeps archives from
# separate executions of the same task from overwriting each other.
remote_file: ${project}/${version_id}/${build_variant}/${task_name}/bazel-debug-logs-${execution}.zip
bucket: mciuploads
permissions: private
visibility: signed
content_type: application/zip
- func: "debug full disk"
- func: "attach bazel invocation"
- func: "save failed tests"

View File

@@ -228,6 +228,28 @@ bazel_evergreen_shutils::print_bazel_server_pid() {
fi
}
#######################################
# Writes a jstack thread dump for every running bazel JVM into the current
# working directory, one file per process, named
# bazel_jstack_<YYYYmmdd_HHMMSS>_pid<PID>.txt. Both stdout and stderr of
# jstack are captured in that file.
# Arguments: none are read; any arguments passed by callers are ignored.
# Returns:   1 if no matching process exists or jstack is not on PATH;
#            0 if every dump succeeded; otherwise the exit status of the
#            last jstack invocation that failed.
#######################################
bazel_evergreen_shutils::jstack_bazel() {
  # Find all bazel processes (Java processes with "bazel" in the command line).
  # pgrep exits non-zero when nothing matches; "|| true" turns that into an
  # empty result that is handled explicitly below.
  local pids
  pids=$(pgrep -f "java.*bazel" || true)
  if [[ -z "$pids" ]]; then
    return 1
  fi

  # Skip if jstack is not available.
  if ! command -v jstack >/dev/null 2>&1; then
    return 1
  fi

  local timestamp
  timestamp=$(date +%Y%m%d_%H%M%S)

  # Keep the loop variables local so the caller's variables are not clobbered.
  local pid output_file rc=0
  # $pids is intentionally unquoted: pgrep prints one PID per line and word
  # splitting turns that into one loop iteration per PID.
  for pid in $pids; do
    output_file="bazel_jstack_${timestamp}_pid${pid}.txt"
    # A process may exit between pgrep and jstack. Record the failure but
    # keep going so the remaining processes are still dumped.
    jstack "$pid" >"$output_file" 2>&1 || rc=$?
  done
  return "$rc"
}
# Starts server (if needed) and prints PID. Safe to call multiple times.
bazel_evergreen_shutils::ensure_server_and_print_pid() {
local BAZEL_BINARY="$1"
@@ -300,18 +322,45 @@ bazel_evergreen_shutils::retry_bazel_cmd() {
cmd+=" ${OOM_GUARD_FLAG}"
fi
local jstack_dumper_pid=""
# Prefix timeout, if any.
if [[ -n "$timeout_str" ]]; then
cmd="${timeout_str} ${cmd}"
# Start a background monitor to run jstack 5 seconds before the timeout will expire.
# This is useful information for debugging a rare hang in bazel where the build gets
# stuck.
local timeout_duration
timeout_duration=$(echo "$timeout_str" | awk '{print $NF}')
if [[ $timeout_duration -gt 5 ]]; then
set -m # Enable job control to create a process group
(
sleep $((timeout_duration - 5))
bazel_evergreen_shutils::jstack_bazel "$BAZEL_BINARY" || true
) &
jstack_dumper_pid=$!
set +m # Disable job control
fi
fi
# Run it.
# NOTE: We *do not* add any redirections here; caller controls logging completely.
if eval $env "$cmd"; then
RET=0
# Kill the jstack dumper if still running
if [[ -n "$jstack_dumper_pid" ]]; then
kill -- -$jstack_dumper_pid 2>/dev/null || true
wait $jstack_dumper_pid 2>/dev/null || true
fi
break
else
RET=$?
# Kill the jstack dumper if still running
if [[ -n "$jstack_dumper_pid" ]]; then
kill -- -$jstack_dumper_pid 2>/dev/null || true
wait $jstack_dumper_pid 2>/dev/null || true
fi
fi
if ! bazel_evergreen_shutils::is_bazel_server_running "$BAZEL_BINARY"; then

View File

@@ -0,0 +1,80 @@
#!/usr/bin/env bash
# Gathers bazel debug artifacts (jstack dumps, command.log, java.log) and
# packs them into a single zip archive.

# Absolute path of the directory holding this script; the helper scripts
# sourced below are looked up there.
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
source "$DIR/prelude.sh"
source "$DIR/bazel_evergreen_shutils.sh"

# Strict mode is switched on only after the helpers have been sourced.
set -eo pipefail

cd src
#######################################
# Archives bazel debug artifacts into bazel-debug-logs.zip in the current
# directory.
#
# jstack dumps (./bazel_jstack_*.txt) are what triggers collection: when none
# exist the function returns immediately without invoking bazel. When dumps
# exist, command.log and java.log from bazel's output_base are added if they
# can be located. A failure to determine output_base is reported but does not
# discard the dumps that were already captured.
#
# Globals:   BAZEL_BINARY (written, as before)
# Arguments: none
# Outputs:   progress and diagnostics on stderr; the zip file on disk
# Returns:   0 when there is nothing to collect or the archive was created;
#            1 when zip is unavailable or archive creation fails
#######################################
collect_bazel_debug_logs() {
  local zip_file="bazel-debug-logs.zip"
  local files_to_zip=()

  # Collect jstack files from the current directory. An unmatched glob stays
  # literal, so the -f test filters it out; "! -L" skips symlinks so that only
  # regular files are picked up.
  local file
  for file in ./bazel_jstack_*.txt; do
    if [[ -f "$file" && ! -L "$file" ]]; then
      files_to_zip+=("$file")
      echo "Found jstack file: $file" >&2
    fi
  done
  if [[ ${#files_to_zip[@]} -eq 0 ]]; then
    echo "No jstack files found; nothing to collect" >&2
    return 0
  fi

  echo "Collecting bazel debug logs into $zip_file" >&2

  # Check if zip command is available
  if ! command -v zip >/dev/null 2>&1; then
    echo "zip command not found; cannot create archive" >&2
    return 1
  fi

  # Look up output_base only now that there is something to archive. The
  # lookup is best-effort: if it fails, the jstack dumps are still archived.
  local ob=""
  if BAZEL_BINARY="$(bazel_evergreen_shutils::bazel_get_binary_path)" &&
    ob="$(bazel_evergreen_shutils::bazel_output_base "$BAZEL_BINARY")" &&
    [[ -n "$ob" ]]; then
    local log_name log_path
    for log_name in command.log java.log; do
      log_path="${ob}/${log_name}"
      if [[ -f "$log_path" ]]; then
        files_to_zip+=("$log_path")
        echo "Found ${log_name}: $log_path" >&2
      fi
    done
  else
    echo "Unable to get bazel output_base; archiving jstack files only" >&2
  fi

  # zip updates an existing archive in place; remove any leftover so the
  # result contains only the files gathered by this run.
  rm -f -- "$zip_file"

  # Create the zip file (-j stores bare file names, without directories).
  echo "Creating archive with ${#files_to_zip[@]} file(s)..." >&2
  if zip -q -j "$zip_file" "${files_to_zip[@]}" 2>&1; then
    echo "Debug logs archived to $(pwd)/$zip_file" >&2
  else
    echo "Failed to create zip archive" >&2
    return 1
  fi
}
collect_bazel_debug_logs