Files
mongo/buildscripts/resmokelib/hang_analyzer/extractor.py
Sean Lyons 8ea21ae1d3 SERVER-117045 Trigger core analysis tasks from bazel resmoke tasks (#47599)
GitOrigin-RevId: 3d697ff268a027bf991f42a9a066bc731e330a25
2026-02-05 21:38:37 +00:00

1005 lines
39 KiB
Python

"""Extracts `mongo-debugsymbols.tgz` in an idempotent manner for performance."""
import concurrent.futures
import glob
import gzip
import json
import os
import re
import shutil
import subprocess
import sys
import tarfile
import time
import urllib.request
import zipfile
from logging import Logger
from pathlib import Path
from typing import Callable, Optional
from opentelemetry import trace
from opentelemetry.trace.status import StatusCode
from retry import retry
from buildscripts.create_rbe_sysroot import create_rbe_sysroot
from buildscripts.resmokelib.hang_analyzer.dumper import Dumper
from buildscripts.resmokelib.setup_multiversion.download import (
DownloadError,
download_from_s3_with_requests,
)
from buildscripts.resmokelib.setup_multiversion.setup_multiversion import (
SetupMultiversion,
_DownloadOptions,
)
from buildscripts.resmokelib.symbolizer import Symbolizer
from buildscripts.resmokelib.utils import evergreen_conn
from buildscripts.resmokelib.utils.filesystem import build_hygienic_bin_path
from buildscripts.resmokelib.utils.otel_thread_pool_executor import OtelThreadPoolExecutor
from buildscripts.resmokelib.utils.otel_utils import get_default_current_span
from buildscripts.resmokelib.utils.runtime_recorder import compare_start_time
from evergreen.task import Artifact, Task
_DEBUG_FILE_BASE_NAMES = ["mongo", "mongod", "mongos"]
TOOLCHAIN_ROOT = "/opt/mongodbtoolchain/v5"
TRACER = trace.get_tracer("resmoke")
def run_with_retries(
root_logger: Logger, func: Callable[..., bool], timeout_secs: int, retry_secs: int, **kwargs
) -> bool:
start_time = time.time()
while True:
try:
if func(root_logger=root_logger, **kwargs):
return True
except Exception as ex:
root_logger.error(f"Exception hit while running {func.__name__}")
root_logger.error(ex)
time_difference = time.time() - start_time
if time_difference > timeout_secs:
root_logger.error(
f"Timeout hit for function {func.__name__} after {time_difference} seconds."
)
return False
root_logger.error(f"Failed to run {func.__name__}, retrying in {retry_secs} seconds...")
time.sleep(retry_secs)
@TRACER.start_as_current_span("core_analyzer.download_core_dumps")
def download_core_dumps(
root_logger: Logger, task: Task, download_dir: str, debugger: Dumper, multiversion_versions: set
) -> bool:
root_logger.info("Looking for core dumps")
artifacts = task.artifacts
core_dumps_found = 0
core_dumps_dir = os.path.join(download_dir, "core-dumps")
os.makedirs(core_dumps_dir, exist_ok=True)
current_span = get_default_current_span()
for artifact in artifacts:
if not artifact.name.startswith("Core Dump"):
continue
file_name = artifact.url.split("/")[-1]
extracted_name, _ = os.path.splitext(file_name)
extract_path = os.path.join(core_dumps_dir, extracted_name)
with TRACER.start_as_current_span(
"core_analyzer.download_core_dump",
attributes={
"core_dump_file_name": file_name,
"core_dump_file_url": artifact.url,
},
) as core_dump_span:
attempts = 0
core_dump_span.set_status(StatusCode.OK)
try:
@retry(tries=3, delay=5)
def download_core_dump(artifact: Artifact):
nonlocal attempts
attempts += 1
root_logger.info(f"Downloading core dump: {file_name}")
if os.path.exists(file_name):
os.remove(file_name)
urllib.request.urlretrieve(artifact.url, file_name)
root_logger.info(f"Extracting core dump: {file_name}")
if os.path.exists(extract_path):
os.remove(extract_path)
with gzip.open(file_name, "rb") as f_in:
with open(extract_path, "wb") as f_out:
shutil.copyfileobj(f_in, f_out)
core_dump_span.set_attributes(
{
"core_dump_compressed_size": os.path.getsize(file_name),
"core_dump_extracted_size": os.path.getsize(extract_path),
"core_dump_download_attempts": attempts,
}
)
root_logger.info(
f"Done extracting core dump {extracted_name} to {extract_path}"
)
os.remove(file_name)
_, bin_version = debugger.get_binary_from_core_dump(extract_path)
if bin_version:
multiversion_versions.add(bin_version)
download_core_dump(artifact)
core_dumps_found += 1
except Exception as ex:
root_logger.error(
"An error occured while trying to download and extract core dump %s",
extracted_name,
)
root_logger.error(ex)
core_dump_span.set_status(StatusCode.ERROR, "Failed to download core dump.")
core_dump_span.set_attributes(
{
"core_dump_download_attempts": attempts,
"core_dump_error": ex,
}
)
core_dump_directory_size = sum(f.stat().st_size for f in Path("./").glob("**/*") if f.is_file())
current_span.set_attributes(
{
"core_dumps_dir": core_dumps_dir,
"core_dump_directory_size": core_dump_directory_size,
}
)
if not core_dumps_found:
root_logger.error("No core dumps found")
current_span.set_status(StatusCode.ERROR, description="No core dumps found")
current_span.set_attributes(
{
"core_dumps_error": "No core dumps found",
}
)
return False
return True
@TRACER.start_as_current_span("core_analyzer.download_multiversion_artifact")
def download_multiversion_artifact(
root_logger: Logger,
version_id: str,
variant: str,
download_options: _DownloadOptions,
download_dir: str,
name: str,
bin_version: str = None,
) -> bool:
current_span = get_default_current_span(
{"downloaded_artifact_type": name, "version": bin_version if bin_version else "current"}
)
try:
root_logger.info("Downloading %s", name)
multiversion_setup = SetupMultiversion(
download_options=download_options,
ignore_failed_push=True,
link_dir=os.path.abspath(download_dir),
)
urlinfo = multiversion_setup.get_urls(version=version_id, buildvariant_name=variant)
if bin_version:
install_dir = os.path.abspath(os.path.join(download_dir, bin_version, "install"))
os.makedirs(install_dir, exist_ok=True)
multiversion_setup.download_and_extract_from_urls(
urlinfo.urls, bin_suffix=bin_version, install_dir=install_dir, skip_symlinks=False
)
else:
install_dir = os.path.abspath(os.path.join(download_dir, "install"))
os.makedirs(install_dir, exist_ok=True)
multiversion_setup.download_and_extract_from_urls(
urlinfo.urls, bin_suffix=None, install_dir=install_dir, skip_symlinks=True
)
root_logger.info("Downloaded %s", name)
return True
except Exception as ex:
root_logger.error("An error occured while trying to download %s", name)
root_logger.error(ex)
current_span.set_status(StatusCode.ERROR, f"Failed to download {name}")
current_span.set_attribute("download_multiversion_artifact_error", ex)
return False
def get_dwarf_version(dwarf_dump):
"""
Find the dwarf version of a compile unit that was produced by a MongoDB compiler or
has mongodb sources in its name.
See a segment of a dwarf dump in buildscripts/tests/resmokelib/hang_analyzer/test_extractor.py
"""
# Checks if a compile unit contains "mongo" or "src/third_party".
# The DW_AT_producer tag will contain the compiler used, which may be stamped with 'MongoDB' for our
# toolchain compilers.
# The DW_AT_name can contain the source file, for which we hope to find 'src/mongo' or 'src/third_party'
mongo_pattern = r"mongo|src/third_party"
version_pattern = r"version = 0x([0-9]{4}),"
for unit in dwarf_dump.split("Compile Unit"):
if re.search(mongo_pattern, unit, re.IGNORECASE):
matches = re.search(version_pattern, unit)
if matches:
return int(matches.group(1))
# If we didn't find any compile unit that looks like the patterns, fallback to ANY dwarf version
# listed in the output.
matches = re.search(version_pattern, dwarf_dump)
if matches:
return int(matches.group(1))
return None
@TRACER.start_as_current_span("core_analyzer.post_install_gdb_optimization")
def post_install_gdb_optimization(download_dir: str, root_looger: Logger):
@TRACER.start_as_current_span("core_analyzer.post_install_gdb_optimization.add_index")
def add_index(file_path: str):
"""Generate and add gdb-index to ELF binary."""
current_span = get_default_current_span(
{
"file": file_path,
"add_index_status": "success",
"add_index_original_file_size": os.path.getsize(file_path),
}
)
start_time = time.time()
process = subprocess.run(
[f"{TOOLCHAIN_ROOT}/bin/llvm-dwarfdump", "-r", "0", file_path],
capture_output=True,
text=True,
)
# it is normal for non debug binaries to fail this command
# there also can be some python files in the bin dir that will fail
if process.returncode != 0:
current_span.set_attribute("add_index_status", "skipped")
return
version = get_dwarf_version(process.stdout)
if version:
current_span.set_attribute("dwarf_version", version)
else:
current_span.set_status(StatusCode.ERROR, "Could not find dwarf version in file.")
current_span.set_attributes(
{
"add_index_status": "failed",
"add_index_error": "Could not find dwarf version in file",
}
)
# The file has no debug info if stdout ends like this
if process.stdout.strip().endswith(".debug_info contents:"):
return
raise RuntimeError(f"Could not find dwarf version in file {file_path}")
target_dir = os.path.dirname(file_path)
try:
# logic copied from https://sourceware.org/gdb/onlinedocs/gdb/Index-Files.html
if version == 5:
subprocess.run(
[
f"{TOOLCHAIN_ROOT}/bin/gdb",
"--batch-silent",
"--quiet",
"--nx",
"--eval-command",
f"save gdb-index -dwarf-5 {target_dir}",
file_path,
],
check=True,
)
# objcopy overwrites the input executable when only given one positional argument.
# /dev/null is specified as the second positional argument to simultaneously prevent
# the debug symbol file from being overwritten and to discard the generated copy.
subprocess.run(
[
f"{TOOLCHAIN_ROOT}/bin/objcopy",
"--dump-section",
f".debug_str={file_path}.debug_str.new",
file_path,
"/dev/null",
],
check=True,
)
with open(f"{file_path}.debug_str", "r") as file1:
with open(f"{file_path}.debug_str.new", "a") as file2:
file2.write(file1.read())
# Ensure the file is writable, we have seen the uploaded binaries be non-writable.
os.chmod(file_path, 0o775)
subprocess.run(
[
f"{TOOLCHAIN_ROOT}/bin/objcopy",
"--add-section",
f".debug_names={file_path}.debug_names",
"--set-section-flags",
".debug_names=readonly",
"--update-section",
f".debug_str={file_path}.debug_str.new",
file_path,
file_path,
],
check=True,
)
os.remove(f"{file_path}.debug_str.new")
os.remove(f"{file_path}.debug_str")
os.remove(f"{file_path}.debug_names")
elif version == 4:
subprocess.run(
[
f"{TOOLCHAIN_ROOT}/bin/gdb",
"--batch-silent",
"--quiet",
"--nx",
"--eval-command",
f"save gdb-index {target_dir}",
file_path,
],
check=True,
)
subprocess.run(
[
f"{TOOLCHAIN_ROOT}/bin/objcopy",
"--add-section",
f".gdb_index={file_path}.gdb-index",
"--set-section-flags",
".gdb_index=readonly",
file_path,
file_path,
],
check=True,
)
os.remove(f"{file_path}.gdb-index")
else:
current_span.set_status(StatusCode.ERROR, f"Unsupported dwarf version: {version}")
current_span.set_attributes(
{
"add_index_status": "failed",
"add_index_error": "Does not support dwarf version",
}
)
raise RuntimeError(f"Does not support dwarf version {version}")
except Exception as ex:
root_looger.exception("Failed to add gdb index to %s", file_path)
current_span.set_status(StatusCode.ERROR, "Failed to add gdb index")
current_span.set_attributes(
{
"add_index_status": "failed",
"add_index_error": str(ex),
}
)
raise ex
current_span.set_attribute("add_index_changed_file_size", os.path.getsize(file_path))
root_looger.debug(
"Finished creating gdb-index for %s in %s", file_path, (time.time() - start_time)
)
@TRACER.start_as_current_span("core_analyzer.post_install_gdb_optimization.recalc_debuglink")
def recalc_debuglink(file_path: str):
"""
Recalcuate the debuglink for ELF binaries.
After creating the index file in a separate debug file, the debuglink CRC
is no longer valid, this will simply recreate the debuglink and therefore
update the CRC to match.
"""
current_span = get_default_current_span(
{"file": file_path, "recalc_debuglink_status": "success"}
)
process = subprocess.run(
[f"{TOOLCHAIN_ROOT}/bin/eu-readelf", "-n", file_path], capture_output=True, text=True
)
if process.returncode != 0:
current_span.set_attribute("recalc_debuglink_status", "skipped")
return
binary_build_id = get_build_id(process.stdout)
process = subprocess.run(
[f"{TOOLCHAIN_ROOT}/bin/eu-readelf", "-n", f"{file_path}.debug"],
capture_output=True,
text=True,
check=True,
)
debugsymbol_build_id = get_build_id(process.stdout)
if binary_build_id != debugsymbol_build_id:
print(f"{file_path} build id: {binary_build_id}")
print(f"{file_path}.debug build id: {debugsymbol_build_id}")
raise RuntimeError("binary and debugsymbol build ids do not match")
try:
# Ensure the file is writable, we have seen the uploaded binaries be non-writable.
os.chmod(file_path, 0o775)
subprocess.run(
[f"{TOOLCHAIN_ROOT}/bin/objcopy", "--remove-section", ".gnu_debuglink", file_path],
check=True,
)
subprocess.run(
[
f"{TOOLCHAIN_ROOT}/bin/objcopy",
"--add-gnu-debuglink",
f"{os.path.abspath(file_path)}.debug",
file_path,
],
check=True,
)
except Exception as ex:
root_looger.exception("Failed to recalculate debuglink")
current_span.set_status(StatusCode.ERROR, "Failed to recalculate debuglink")
current_span.set_attributes(
{
"recalc_debuglink_status": "failed",
"recalc_debuglink_error": str(ex),
}
)
raise ex
root_looger.debug("Finished recalculating the debuglink for %s", file_path)
install_dir = os.path.join(download_dir, "install")
if os.path.exists(os.path.join(install_dir, "dist-test")):
install_dir = os.path.join(install_dir, "dist-test")
bin_dir = os.path.join(install_dir, "bin")
bin_files = [os.path.join(bin_dir, file_path) for file_path in os.listdir(bin_dir)]
lib_dir = os.path.join(install_dir, "lib")
lib_files = []
if os.path.exists(lib_dir):
lib_files = [os.path.join(lib_dir, file_path) for file_path in os.listdir(lib_dir)]
current_span = get_default_current_span()
# add gdb indexes so gdb can process these libraries faster, we cannot do this to the binaries.
with OtelThreadPoolExecutor() as executor:
with TRACER.start_as_current_span(
"core_analyzer.post_install_gdb_optimization.add_indexes"
) as current_span:
futures = []
current_span.set_status(StatusCode.OK)
for file_path in lib_files:
# If a debug file exists for this file, then the debug symbols are
# over in that other file and we can skip this file.
if os.path.exists(f"{file_path}.debug"):
continue
# When we add the .gdb_index section to binaries being ran with gdb
# it makes gdb not longer recognize them as the binary that generated
# the core dumps
futures.append(executor.submit(add_index, file_path=file_path))
for future in concurrent.futures.as_completed(futures):
# raise any exceptions that were thrown in child processes
future.result()
# add gnu_debuglink section to all libraries and debugsymbols. This tells gdb where to look
# to find the debugsymbols.
with TRACER.start_as_current_span(
"core_analyzer.post_install_gdb_optimization.recalc_debuglink"
):
futures = []
current_span = get_default_current_span()
for file_path in lib_files + bin_files:
# There will be no debuglinks in the separate debug files
# We do not want to edit any of the binary files that gdb might directly run
# so we skip the bin directory
if file_path.endswith(".debug") or not os.path.exists(f"{file_path}.debug"):
continue
futures.append(executor.submit(recalc_debuglink, file_path=file_path))
for future in concurrent.futures.as_completed(futures):
# raise any exceptions that were thrown in child processes
future.result()
def get_build_id(process_output: str) -> str:
result = re.search("Build ID: ([0-9a-z]+)", process_output)
return result.group(1)
@TRACER.start_as_current_span("core_analyzer.download_task_artifacts")
def download_task_artifacts(
root_logger: Logger,
task_id: str,
download_dir: str,
debugger: Dumper,
multiversion_dir: str,
execution: Optional[int] = None,
retry_secs: int = 10,
download_timeout_secs: int = 30 * 60,
) -> bool:
if os.path.exists(download_dir):
# quick sanity check to ensure we don't delete a repo
if os.path.exists(os.path.join(download_dir, ".git")):
raise RuntimeError(f"Input dir cannot be a git repo: {download_dir}")
shutil.rmtree(download_dir)
root_logger.info(f"Deleted existing dir at {download_dir}")
os.mkdir(download_dir)
current_span = get_default_current_span({"download_task_id": task_id})
evg_api = evergreen_conn.get_evergreen_api()
if execution is not None:
task_info = evg_api.task_by_id(task_id=task_id, execution=execution)
else:
task_info = evg_api.task_by_id(task_id)
binary_download_options = _DownloadOptions(db=True, ds=False, da=False, dv=False)
debugsymbols_download_options = _DownloadOptions(db=False, ds=True, da=False, dv=False)
@retry(tries=3, delay=5)
def get_multiversion_download_links(task: Task) -> Optional[dict]:
for artifact in task.artifacts:
if artifact.name != "Multiversion download links":
continue
with urllib.request.urlopen(artifact.url) as url:
return json.load(url)
return None
# dictionary of versions and the information about where to download that version
# The key is the version, if the key is an empty string that means master was downloaded
multiversion_downloads = get_multiversion_download_links(task_info)
# We support `skip_compile` tasks that download master instead of compiling it
# For analysis on these tasks we should download the same binaries that the task downloaded
variant = task_info.build_variant
version_id = task_info.version_id
if multiversion_downloads:
skip_download = next(
filter(lambda v: v.get("bin_suffix") == "", multiversion_downloads), None
)
if skip_download:
version_id = skip_download["evg_urls_info"]["evg_version_id"]
variant = skip_download["evg_urls_info"]["evg_build_variant"]
if variant == "enterprise-rhel-8-64-bit-future-git-tag-multiversion":
# Tasks on this variant depend on multiple archive_dist_test tasks from the same version, so
# it is ambiguous which binaries/debug symbols to download. We want "-latest" for the "new" binaries,
# and -last-lts/-last-continuous will come from the multiversion download list for the "old".
variant = "linux-x86-dynamic-compile-future-tag-multiversion-latest"
all_downloaded = True
multiversion_versions = set()
with OtelThreadPoolExecutor() as executor:
futures = []
futures.append(
executor.submit(
run_with_retries,
root_logger=root_logger,
func=download_core_dumps,
timeout_secs=download_timeout_secs,
retry_secs=retry_secs,
task=task_info,
download_dir=download_dir,
debugger=debugger,
multiversion_versions=multiversion_versions,
)
)
futures.append(
executor.submit(
run_with_retries,
func=download_multiversion_artifact,
timeout_secs=download_timeout_secs,
retry_secs=retry_secs,
root_logger=root_logger,
version_id=version_id,
variant=variant,
download_options=binary_download_options,
download_dir=download_dir,
name="binaries",
)
)
futures.append(
executor.submit(
run_with_retries,
func=download_multiversion_artifact,
timeout_secs=download_timeout_secs,
retry_secs=retry_secs,
root_logger=root_logger,
version_id=version_id,
variant=variant,
download_options=debugsymbols_download_options,
download_dir=download_dir,
name="debugsymbols",
)
)
for future in concurrent.futures.as_completed(futures):
if not future.result():
current_span.set_status(StatusCode.ERROR, "Errors occured while fetching artifacts")
current_span.set_attribute(
"download_task_artifacts_error", "Errors occured while fetching artifacts"
)
root_logger.error("Errors occured while fetching artifacts")
all_downloaded = False
break
if multiversion_versions:
if not multiversion_downloads:
raise RuntimeError("Multiversion core dumps were found without download links.")
with OtelThreadPoolExecutor() as executor:
futures = []
for version in multiversion_versions:
version_downloads = next(
filter(
lambda actual, desired=version: actual.get("bin_suffix") == desired,
multiversion_downloads,
)
)
version_id = version_downloads["evg_urls_info"]["evg_version_id"]
variant = version_downloads["evg_urls_info"]["evg_build_variant"]
futures.append(
executor.submit(
run_with_retries,
func=download_multiversion_artifact,
timeout_secs=download_timeout_secs,
retry_secs=retry_secs,
root_logger=root_logger,
version_id=version_id,
variant=variant,
download_options=binary_download_options,
download_dir=multiversion_dir,
name=f"binaries-{version}",
bin_version=version,
)
)
futures.append(
executor.submit(
run_with_retries,
func=download_multiversion_artifact,
timeout_secs=download_timeout_secs,
retry_secs=retry_secs,
root_logger=root_logger,
version_id=version_id,
variant=variant,
download_options=debugsymbols_download_options,
download_dir=multiversion_dir,
name=f"debugsymbols-{version}",
bin_version=version,
)
)
for future in concurrent.futures.as_completed(futures):
if not future.result():
current_span.set_status(
StatusCode.ERROR, "Errors occured while fetching old version artifacts"
)
current_span.set_attribute(
"download_task_artifacts_error",
"Errors occured while fetching old version artifacts",
)
root_logger.error("Errors occured while fetching old version artifacts")
all_downloaded = False
break
if all_downloaded and sys.platform.startswith("linux"):
post_install_gdb_optimization(download_dir, root_logger)
for version in multiversion_versions:
post_install_gdb_optimization(os.path.join(multiversion_dir, version), root_logger)
return all_downloaded
def download_debug_symbols(
root_logger, symbolizer: Symbolizer, retry_secs: int = 10, download_timeout_secs: int = 10 * 60
):
"""
Extract debug symbols. Idempotent.
:param root_logger: logger to use
:param symbolizer: pre-configured instance of symbolizer for downloading symbols.
:param retry_secs: seconds before retrying to download symbols
:param download_timeout_secs: timeout in seconds before failing to download
:return: None
"""
# Check if the files are already there. They would be on *SAN builds.
sym_files = _get_symbol_files()
if len(sym_files) >= len(_DEBUG_FILE_BASE_NAMES):
root_logger.info(
"Skipping downloading debug symbols as there are already symbol files present"
)
return
while True:
try:
symbolizer.execute()
root_logger.info("Debug symbols successfully downloaded")
break
except (tarfile.ReadError, DownloadError):
root_logger.warn(
"Debug symbols unavailable after %s secs, retrying in %s secs, waiting for a total of %s secs",
compare_start_time(time.time()),
retry_secs,
download_timeout_secs,
)
time.sleep(retry_secs)
if compare_start_time(time.time()) > download_timeout_secs:
root_logger.warn(
"Debug-symbols archive-file does not exist after %s secs; "
"Hang-Analyzer may not complete successfully.",
download_timeout_secs,
)
break
def _get_symbol_files():
out = []
for ext in ["debug", "dSYM", "pdb"]:
for file in _DEBUG_FILE_BASE_NAMES:
haystack = build_hygienic_bin_path(child="{file}.{ext}".format(file=file, ext=ext))
for needle in glob.glob(haystack):
out.append((needle, os.path.join(os.getcwd(), os.path.basename(needle))))
return out
def check_manifest_for_cores(root_logger: Logger, manifest_url: str) -> bool:
"""Check if a test.outputs manifest contains core dumps."""
try:
with urllib.request.urlopen(manifest_url) as response:
manifest_content = response.read().decode("utf-8")
has_cores = ".core" in manifest_content or ".mdmp" in manifest_content
if has_cores:
root_logger.info("Manifest indicates core dumps are present")
else:
root_logger.info("Manifest indicates no core dumps")
return has_cores
except Exception as ex:
root_logger.warning(
f"Could not read manifest: {ex}. Will download entire test outputs and check for cores."
)
return True # If we can't read the manifest, assume cores might be present
@TRACER.start_as_current_span("core_analyzer.download_bazel_result_task_cores")
def download_bazel_result_task_cores(root_logger: Logger, task_id: str, download_dir: str) -> bool:
root_logger.info(f"Downloading cores from task {task_id}")
current_span = get_default_current_span({"download_task_id": task_id})
evg_api = evergreen_conn.get_evergreen_api()
task_info = evg_api.task_by_id(task_id)
core_dumps_dir = os.path.join(download_dir, "core-dumps")
os.makedirs(core_dumps_dir, exist_ok=True)
outputs_artifacts = []
manifest_map = {} # Map of outputs zip archive to its manifest
for artifact in task_info.artifacts:
if "test.outputs" in artifact.name and artifact.name.endswith(".zip"):
outputs_artifacts.append(artifact)
elif "_manifest__MANIFEST" in artifact.name:
manifest_map[artifact.name.replace("_manifest__MANIFEST", "__outputs.zip")] = (
artifact.url
)
if not outputs_artifacts:
root_logger.warning("No test.outputs artifacts found in result task")
return False
core_dumps_found = 0
for artifact in outputs_artifacts:
root_logger.info(f"Processing artifact: {artifact.name}")
# Check manifest first to see if cores are present
manifest_url = manifest_map.get(artifact.name)
if manifest_url:
if not check_manifest_for_cores(root_logger, manifest_url):
root_logger.info(f"Skipping {artifact.name} - no cores in manifest")
continue
else:
root_logger.warning(f"No manifest found for {artifact.name}, will download anyway")
file_name = artifact.name
zip_path = os.path.join(download_dir, file_name)
try:
@retry(tries=3, delay=5)
def download_outputs_zip():
root_logger.info(f"Downloading {file_name}")
if os.path.exists(zip_path):
os.remove(zip_path)
download_from_s3_with_requests(artifact.url, zip_path)
download_outputs_zip()
with zipfile.ZipFile(zip_path, "r") as zip_ref:
for member in zip_ref.namelist():
if member.endswith(".core") or member.endswith(".mdmp"):
core_name = os.path.basename(member)
extract_path = os.path.join(core_dumps_dir, core_name)
root_logger.info(f"Extracting core dump: {core_name}")
with zip_ref.open(member) as source, open(extract_path, "wb") as target:
shutil.copyfileobj(source, target)
core_dumps_found += 1
os.remove(zip_path)
except Exception as ex:
root_logger.error(f"Error processing artifact {artifact.name}: {ex}")
current_span.set_status(
StatusCode.ERROR, f"Failed to download artifact {artifact.name}"
)
current_span.set_attribute("download_error", str(ex))
root_logger.info(f"Downloaded {core_dumps_found} core dump(s)")
if core_dumps_found == 0:
root_logger.error("No core dumps found in test.outputs")
current_span.set_status(StatusCode.ERROR, "No core dumps found")
return False
current_span.set_attribute("core_dumps_found", core_dumps_found)
return True
def find_test_task_with_binaries(evg_api, results_task_id: str):
"""
Find the test task in the same build that has dist-tests artifacts.
:param evg_api: Evergreen API client
:param results_task_id:
:return: Task ID of the resmoke_tests task, or None if not found
"""
try:
task = evg_api.task_by_id(results_task_id)
tasks = evg_api.tasks_by_build(task.build_id)
if "_burn_in_" in task.display_name:
resmoke_tests_task = list(
filter(lambda t: t.display_name.startswith("resmoke_tests_burn_in"), tasks)
)
else:
resmoke_tests_task = list(filter(lambda t: t.display_name == "resmoke_tests", tasks))
assert (
len(resmoke_tests_task) == 1
), f"Could not find a unique resmoke test task in this variant {task.build_variant_display_name}"
return resmoke_tests_task[0]
except Exception as ex:
print(f"ERROR: Failed to query Evergreen for test task: {ex}")
return None
@TRACER.start_as_current_span("core_analyzer.download_bazel_test_task_binaries")
def download_bazel_test_task_binaries(root_logger: Logger, task_id: str, download_dir: str) -> bool:
evg_api = evergreen_conn.get_evergreen_api()
resmoke_task = find_test_task_with_binaries(evg_api, task_id)
root_logger.info(f"Downloading binaries from task {resmoke_task.task_id}")
dist_tests_artifacts = [
a for a in resmoke_task.artifacts if "Test binaries and libraries" in a.name
]
if not dist_tests_artifacts:
root_logger.error("No binary archive found in the resmoke_test task")
return False
install_dir = os.path.join(download_dir, "install")
os.makedirs(install_dir, exist_ok=True)
for artifact in dist_tests_artifacts:
file_name = "resmoke_tests.tgz"
download_path = os.path.join(download_dir, file_name)
try:
@retry(tries=3, delay=5)
def download_binary_artifact():
root_logger.info(f"Downloading {file_name}")
if os.path.exists(download_path):
os.remove(download_path)
download_from_s3_with_requests(artifact.url, download_path)
download_binary_artifact()
root_logger.info(f"Extracting {file_name}")
with tarfile.open(download_path, "r:gz") as tar:
# Extract members, mapping dist-tests -> dist-test
for member in tar.getmembers():
if member.name.startswith("dist-tests/"):
member.name = member.name.replace("dist-tests/", "dist-test/", 1)
tar.extract(member, install_dir)
os.remove(download_path)
except Exception as ex:
root_logger.error(f"Error downloading/extracting {file_name}: {ex}")
return False
root_logger.info("Successfully downloaded and extracted binaries")
return True
@TRACER.start_as_current_span("core_analyzer.download_bazel_task_artifacts")
def download_bazel_task_artifacts(
root_logger: Logger,
task_id: str,
download_dir: str,
retry_secs: int = 10,
download_timeout_secs: int = 30 * 60,
) -> bool:
if os.path.exists(download_dir):
# quick sanity check to ensure we don't delete a repo
if os.path.exists(os.path.join(download_dir, ".git")):
raise RuntimeError(f"Input dir cannot be a git repo: {download_dir}")
shutil.rmtree(download_dir)
root_logger.info(f"Deleted existing dir at {download_dir}")
os.mkdir(download_dir)
current_span = get_default_current_span({"task_id": task_id})
# Download cores and binaries in parallel
with OtelThreadPoolExecutor() as executor:
futures = []
# Download core dumps from result task
futures.append(
executor.submit(
run_with_retries,
root_logger=root_logger,
func=download_bazel_result_task_cores,
timeout_secs=download_timeout_secs,
retry_secs=retry_secs,
task_id=task_id,
download_dir=download_dir,
)
)
# Download binaries from test task
futures.append(
executor.submit(
run_with_retries,
root_logger=root_logger,
func=download_bazel_test_task_binaries,
timeout_secs=download_timeout_secs,
retry_secs=retry_secs,
task_id=task_id,
download_dir=download_dir,
)
)
all_downloaded = True
for future in concurrent.futures.as_completed(futures):
if not future.result():
current_span.set_status(
StatusCode.ERROR, "Errors occurred while fetching artifacts"
)
current_span.set_attribute(
"download_bazel_task_artifacts_error",
"Errors occurred while fetching artifacts",
)
root_logger.error("Errors occurred while fetching bazel artifacts")
all_downloaded = False
break
if all_downloaded and sys.platform.startswith("linux"):
sysroot = os.path.join(download_dir, "rbe_sysroot")
create_rbe_sysroot(sysroot)
post_install_gdb_optimization(download_dir, root_logger)
return all_downloaded, sysroot