SERVER-119173: Collect bazel debug information on timeouts (#47724)
GitOrigin-RevId: 3586fd086ea3888368091d830f2ddd22428fb63f
This commit is contained in:
@@ -609,6 +609,24 @@ task_groups:
|
||||
permissions: private
|
||||
visibility: signed
|
||||
content_type: application/json
|
||||
- func: "f_expansions_write"
|
||||
- command: subprocess.exec
|
||||
display_name: "collect bazel debug logs"
|
||||
params:
|
||||
binary: bash
|
||||
args:
|
||||
- "src/evergreen/collect_bazel_debug_logs.sh"
|
||||
- command: s3.put
|
||||
params:
|
||||
optional: true
|
||||
aws_key: ${aws_key}
|
||||
aws_secret: ${aws_secret}
|
||||
local_file: src/bazel-debug-logs.zip
|
||||
remote_file: ${project}/${version_id}/${build_variant}/${task_name}/bazel-debug-logs-${execution}.zip
|
||||
bucket: mciuploads
|
||||
permissions: private
|
||||
visibility: signed
|
||||
content_type: application/zip
|
||||
- func: "debug full disk"
|
||||
- func: "attach bazel invocation"
|
||||
- func: "save failed tests"
|
||||
|
||||
@@ -228,6 +228,28 @@ bazel_evergreen_shutils::print_bazel_server_pid() {
|
||||
fi
|
||||
}
|
||||
|
||||
# Dumps Java thread stacks (via jstack) for every running bazel JVM.
#
# Each matching process gets its own file in the current working directory,
# named bazel_jstack_<YYYYmmdd_HHMMSS>_pid<PID>.txt. Both stdout and stderr of
# jstack go into that file, so a failed dump still leaves its error message
# behind. Any arguments passed to this function are ignored.
#
# Returns:
#   0 if every dump succeeded.
#   1 if no bazel JVM was found, jstack is not available, or any dump failed.
bazel_evergreen_shutils::jstack_bazel() {
    # Declare every working variable local so nothing leaks into the scope of
    # whoever sourced this library.
    local pids pid timestamp output_file
    local rc=0

    # Find all bazel processes (Java processes with "bazel" in command line).
    # pgrep exits non-zero when nothing matches; "|| true" keeps errexit quiet.
    pids=$(pgrep -f "java.*bazel" || true)
    if [[ -z "$pids" ]]; then
        return 1
    fi

    # Skip if jstack is not available.
    if ! command -v jstack >/dev/null 2>&1; then
        return 1
    fi

    # One timestamp for the whole batch so the files of a single run group together.
    timestamp=$(date +%Y%m%d_%H%M%S)

    # Intentionally unquoted: pgrep prints one PID per line and we want word splitting.
    for pid in $pids; do
        output_file="bazel_jstack_${timestamp}_pid${pid}.txt"
        # A process may exit between pgrep and jstack. Record the failure but
        # keep dumping the remaining processes.
        jstack "$pid" >"$output_file" 2>&1 || rc=1
    done

    return "$rc"
}
|
||||
|
||||
# Starts server (if needed) and prints PID. Safe to call multiple times.
|
||||
bazel_evergreen_shutils::ensure_server_and_print_pid() {
|
||||
local BAZEL_BINARY="$1"
|
||||
@@ -300,18 +322,45 @@ bazel_evergreen_shutils::retry_bazel_cmd() {
|
||||
cmd+=" ${OOM_GUARD_FLAG}"
|
||||
fi
|
||||
|
||||
local jstack_dumper_pid=""
|
||||
|
||||
# Prefix timeout, if any.
|
||||
if [[ -n "$timeout_str" ]]; then
|
||||
cmd="${timeout_str} ${cmd}"
|
||||
|
||||
# Start a background monitor to run jstack 5 seconds before the timeout will expire.
|
||||
# This is useful information for debugging a rare hang in bazel where the build gets
|
||||
# stuck.
|
||||
local timeout_duration
|
||||
timeout_duration=$(echo "$timeout_str" | awk '{print $NF}')
|
||||
if [[ $timeout_duration -gt 5 ]]; then
|
||||
set -m # Enable job control to create a process group
|
||||
(
|
||||
sleep $((timeout_duration - 5))
|
||||
bazel_evergreen_shutils::jstack_bazel "$BAZEL_BINARY" || true
|
||||
) &
|
||||
jstack_dumper_pid=$!
|
||||
set +m # Disable job control
|
||||
fi
|
||||
fi
|
||||
|
||||
# Run it.
|
||||
# NOTE: We *do not* add any redirections here; caller controls logging completely.
|
||||
if eval $env "$cmd"; then
|
||||
RET=0
|
||||
# Kill the jstack dumper if still running
|
||||
if [[ -n "$jstack_dumper_pid" ]]; then
|
||||
kill -- -$jstack_dumper_pid 2>/dev/null || true
|
||||
wait $jstack_dumper_pid 2>/dev/null || true
|
||||
fi
|
||||
break
|
||||
else
|
||||
RET=$?
|
||||
# Kill the jstack dumper if still running
|
||||
if [[ -n "$jstack_dumper_pid" ]]; then
|
||||
kill -- -$jstack_dumper_pid 2>/dev/null || true
|
||||
wait $jstack_dumper_pid 2>/dev/null || true
|
||||
fi
|
||||
fi
|
||||
|
||||
if ! bazel_evergreen_shutils::is_bazel_server_running "$BAZEL_BINARY"; then
|
||||
|
||||
80
evergreen/collect_bazel_debug_logs.sh
Executable file
80
evergreen/collect_bazel_debug_logs.sh
Executable file
@@ -0,0 +1,80 @@
|
||||
#!/usr/bin/env bash
# Gathers bazel debugging artifacts (jstack dumps, command.log, java.log) and
# packs them into a single zip archive.

# Resolve this script's own directory first, while the working directory is
# still unchanged, so the helper libraries can be sourced by absolute path.
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
source "$DIR/prelude.sh"
source "$DIR/bazel_evergreen_shutils.sh"

# Strict mode is enabled only after the helpers have been sourced.
set -o errexit -o pipefail

# Everything below operates relative to the src checkout.
cd src
|
||||
|
||||
# Archives bazel debug artifacts into bazel-debug-logs.zip in the current
# working directory.
#
# The jstack dumps (./bazel_jstack_*.txt) are the primary artifact: when none
# exist there is nothing to collect and the function returns 0 without
# touching bazel or zip. When dumps exist, command.log and java.log from
# bazel's output_base are added if they can be located. Failing to resolve the
# output_base only produces a warning, so the dumps are never lost because of it.
# All progress and error messages go to stderr.
#
# Returns:
#   0 if there was nothing to collect or the archive was created.
#   1 if the zip command is missing or the archive could not be created.
collect_bazel_debug_logs() {
    local zip_file="bazel-debug-logs.zip"
    local files_to_zip=()
    local file

    # Collect jstack files from the current directory. An unmatched glob stays
    # literal, and the -f test filters that out. Symlinks are skipped so only
    # regular files are picked up.
    for file in ./bazel_jstack_*.txt; do
        if [[ -f "$file" && ! -L "$file" ]]; then
            files_to_zip+=("$file")
            echo "Found jstack file: $file" >&2
        fi
    done

    # Check this before anything else so the common case does not depend on
    # bazel or zip being usable.
    if [[ ${#files_to_zip[@]} -eq 0 ]]; then
        echo "No jstack files found; nothing to collect" >&2
        return 0
    fi

    echo "Collecting bazel debug logs into $zip_file" >&2

    # Check if zip command is available.
    if ! command -v zip >/dev/null 2>&1; then
        echo "zip command not found; cannot create archive" >&2
        return 1
    fi

    # Best effort: add bazel's own logs from the output_base. Keeping the
    # lookups inside the "if" condition prevents errexit from aborting the
    # script when they fail.
    local BAZEL_BINARY
    local ob=""
    local log_name log_path
    if BAZEL_BINARY="$(bazel_evergreen_shutils::bazel_get_binary_path)" &&
        ob="$(bazel_evergreen_shutils::bazel_output_base "$BAZEL_BINARY")" &&
        [[ -n "$ob" ]]; then
        for log_name in command.log java.log; do
            log_path="${ob}/${log_name}"
            if [[ -f "$log_path" ]]; then
                files_to_zip+=("$log_path")
                echo "Found ${log_name}: $log_path" >&2
            fi
        done
    else
        echo "Unable to get bazel output_base; archiving jstack files only" >&2
    fi

    # zip updates an existing archive in place, so remove any stale one first
    # to guarantee the result holds only the files listed above.
    rm -f "$zip_file"

    # Create the zip file. -j stores the files without their directory paths.
    echo "Creating archive with ${#files_to_zip[@]} file(s)..." >&2
    if zip -q -j "$zip_file" "${files_to_zip[@]}" >&2; then
        echo "Debug logs archived to $(pwd)/$zip_file" >&2
    else
        echo "Failed to create zip archive" >&2
        return 1
    fi
}
|
||||
|
||||
# Script entry point: run the collection. With errexit enabled earlier in this
# script, a non-zero return from the function makes the script exit with failure.
collect_bazel_debug_logs
|
||||
Reference in New Issue
Block a user