SERVER-106566 Ingest telemetry from resmoke remote test executions (#46144)

GitOrigin-RevId: 7b1add9325366a78afa1d1db850faab60b092c7e
This commit is contained in:
Sean Lyons
2026-01-12 09:23:36 -05:00
committed by MongoDB Bot
parent b8f29f350a
commit 7d928c3c1b
7 changed files with 121 additions and 8 deletions

View File

@@ -39,14 +39,16 @@ def add_evergreen_build_info(args):
add_volatile_arg(args, "--buildId=", "build_id")
add_volatile_arg(args, "--distroId=", "distro_id")
add_volatile_arg(args, "--executionNumber=", "execution")
add_volatile_arg(args, "--projectName=", "project")
add_volatile_arg(args, "--gitRevision=", "revision")
add_volatile_arg(args, "--otelParentId=", "otel_parent_id")
add_volatile_arg(args, "--otelTraceId=", "otel_trace_id")
add_volatile_arg(args, "--projectName=", "project")
add_volatile_arg(args, "--requester=", "requester")
add_volatile_arg(args, "--revisionOrderId=", "revision_order_id")
add_volatile_arg(args, "--taskId=", "task_id")
add_volatile_arg(args, "--taskName=", "task_name")
add_volatile_arg(args, "--variantName=", "build_variant")
add_volatile_arg(args, "--versionId=", "version_id")
add_volatile_arg(args, "--requester=", "requester")
class ResmokeShimContext:

View File

@@ -93,16 +93,18 @@ def print_test_runtimes_status():
def print_evergreen_expansions():
for expansion in [
"build_id",
"build_variant",
"distro_id",
"execution",
"otel_parent_id",
"otel_trace_id",
"project",
"requester",
"revision",
"revision_order_id",
"task_id",
"task_name",
"build_variant",
"version_id",
"requester",
]:
value = os.environ.get(expansion, "")
if value:

View File

@@ -16,7 +16,7 @@ import textwrap
import traceback
from functools import cache
from pathlib import Path
from typing import Dict, Optional
from typing import Dict, List, Optional
import git
import pymongo.uri_parser
@@ -34,6 +34,7 @@ from buildscripts.resmokelib.run import TestRunner
from buildscripts.resmokelib.utils import autoloader
from buildscripts.resmokelib.utils.batched_baggage_span_processor import BatchedBaggageSpanProcessor
from buildscripts.resmokelib.utils.file_span_exporter import FileSpanExporter
from buildscripts.resmokelib.utils.otel_id_generator import ResmokeOtelIdGenerator
from buildscripts.util.read_config import read_config_file
from buildscripts.util.taskname import determine_task_base_name
from buildscripts.util.teststats import HistoricTaskData
@@ -286,6 +287,8 @@ def _set_up_tracing(
trace_id: Optional[str],
parent_span_id: Optional[str],
extra_context: Dict[str, object],
suite_files: Optional[List[str]] = None,
shard_index: Optional[int] = None,
) -> bool:
"""Try to set up otel tracing. On success return True. On failure return False.
@@ -306,7 +309,9 @@ def _set_up_tracing(
# Service name is required for most backends
resource = Resource(attributes={SERVICE_NAME: "resmoke"})
provider = TracerProvider(resource=resource)
# Use custom ID generator to prevent span ID collisions in parallel resmoke invocations
id_generator = ResmokeOtelIdGenerator(suite_files=suite_files, shard_index=shard_index)
provider = TracerProvider(resource=resource, id_generator=id_generator)
if otel_collector_dir:
try:
otel_collector_dir = Path(otel_collector_dir)
@@ -861,6 +866,8 @@ flags in common: {common_set}
_config.OTEL_TRACE_ID,
_config.OTEL_PARENT_ID,
extra_context=extra_context,
suite_files=_config.SUITE_FILES,
shard_index=_config.SHARD_INDEX,
)
if not setup_success:
print(

View File

@@ -16,6 +16,7 @@ py_library(
"globstar.py",
"history.py",
"jscomment.py",
"otel_id_generator.py",
"otel_thread_pool_executor.py",
"otel_utils.py",
"queue.py",

View File

@@ -0,0 +1,58 @@
import random
import time
from typing import List, Optional
from opentelemetry import trace
from opentelemetry.sdk.trace import IdGenerator
class ResmokeOtelIdGenerator(IdGenerator):
    """
    ID generator intended to keep span IDs distinct across parallel resmoke.py invocations.

    Each instance owns a private ``random.Random`` seeded from the built-in ``hash()`` of:
    - The current wall-clock time in milliseconds
    - Optionally, the concatenated names of the suites being run
    - Optionally, the shard index of this resmoke invocation

    This helps prevent ID collisions when multiple resmoke.py processes are
    run in parallel with the same traceID and parentSpanID.
    """

    def __init__(self, suite_files: Optional[List[str]] = None, shard_index: Optional[int] = None):
        """
        Build the seed and create the private random number generator.

        Args:
            suite_files: Optional list of suites; joined into one string for the seed
            shard_index: Optional shard index to incorporate into the seed
        """
        # The millisecond timestamp is always part of the seed; the other parts are
        # appended only when the caller supplied them.
        seed_material = (int(time.time() * 1_000),)
        if suite_files is not None:
            seed_material += ("".join(suite_files),)
        if shard_index is not None:
            seed_material += (shard_index,)

        # A dedicated Random instance leaves the module-level random state untouched.
        self._rng = random.Random(hash(seed_material))

    def generate_span_id(self) -> int:
        """Return a 64-bit span ID, redrawing if the invalid sentinel comes up."""
        while True:
            candidate = self._rng.getrandbits(64)
            if candidate != trace.INVALID_SPAN_ID:
                return candidate

    def generate_trace_id(self) -> int:
        """Return a 128-bit trace ID, redrawing if the invalid sentinel comes up."""
        while True:
            candidate = self._rng.getrandbits(128)
            if candidate != trace.INVALID_TRACE_ID:
                return candidate

View File

@@ -1292,16 +1292,18 @@ functions:
binary: bash
env:
build_id: ${build_id}
build_variant: ${build_variant}
distro_id: ${distro_id}
execution: ${execution}
otel_parent_id: ${otel_parent_id}
otel_trace_id: ${otel_trace_id}
project: ${project}
requester: ${requester}
revision: ${revision}
revision_order_id: ${revision_order_id}
task_id: ${task_id}
task_name: ${task_name}
build_variant: ${build_variant}
version_id: ${version_id}
requester: ${requester}
args:
- "./src/evergreen/resmoke_tests_execute_bazel.sh"

View File

@@ -108,6 +108,45 @@ function symlink_test_logs() {
done
}
# Combine all resmoke telemetry and place it where Evergreen expects it: ${workdir}/build/OTelTraces.
# Metrics are batched into line-separated JSON files targeting no more than 4MB each. Evergreen
# processes fewer files faster, but hits message size limitations if they are too large.
#
# Reads:   ${workdir}/results/**/metrics/metrics*.json
# Writes:  metrics.json, then metrics_1.json, metrics_2.json, ... in the output directory.
# Note:    input files are never split, so a single input file larger than the limit yields an
#          output file larger than the limit.
function combine_metrics() {
    local output_dir="${workdir}/build/OTelTraces"
    mkdir -p "$output_dir"

    local max_size=$((4 * 1024 * 1024)) # 4MB in bytes
    local newline_size=1                # one separator newline is appended after each input file
    local file_counter=0
    local current_size=0
    local current_output="${output_dir}/metrics.json"
    local file
    local file_size

    # Create initial empty file
    >"$current_output"

    # The loop body runs in a pipeline subshell; that is fine because none of the counters are
    # needed once the loop has finished.
    find "${workdir}/results" -wholename '*metrics/metrics*.json' -type f -print0 | while IFS= read -r -d '' file; do
        # A failed stat counts the file as zero bytes rather than aborting the merge.
        file_size=$(stat -c%s "$file") || file_size=0

        # Check if adding this file would exceed the limit
        if ((current_size + file_size + newline_size > max_size && current_size > 0)); then
            # Start a new file. Use an arithmetic assignment instead of ((file_counter++)):
            # the post-increment form evaluates to 0 on the first rollover, which is a non-zero
            # exit status and would terminate the loop when errexit is enabled.
            file_counter=$((file_counter + 1))
            current_output="${output_dir}/metrics_${file_counter}.json"
            current_size=0
            >"$current_output"
        fi

        # Append the file content
        cat "$file" >>"$current_output"
        echo "" >>"$current_output" # Adds a single newline after each file's content

        # Update current size
        current_size=$((current_size + file_size + newline_size))
    done

    echo 'Combined OTel metrics json'
}
# Combines all Resmoke test report JSONs into a single JSON.
function combine_reports() {
local report_files=$(find "${workdir}" -name 'report*.json' -type f 2>/dev/null)
@@ -187,6 +226,8 @@ while IFS= read -r test_result; do
popd >/dev/null
done < <(enumerate_test_results)
combine_metrics
failures=$(combine_reports)
write_bazel_invocation