SERVER-106566 Ingest telemetry from resmoke remote test executions (#46144)
GitOrigin-RevId: 7b1add9325366a78afa1d1db850faab60b092c7e
This commit is contained in:
@@ -39,14 +39,16 @@ def add_evergreen_build_info(args):
|
||||
add_volatile_arg(args, "--buildId=", "build_id")
|
||||
add_volatile_arg(args, "--distroId=", "distro_id")
|
||||
add_volatile_arg(args, "--executionNumber=", "execution")
|
||||
add_volatile_arg(args, "--projectName=", "project")
|
||||
add_volatile_arg(args, "--gitRevision=", "revision")
|
||||
add_volatile_arg(args, "--otelParentId=", "otel_parent_id")
|
||||
add_volatile_arg(args, "--otelTraceId=", "otel_trace_id")
|
||||
add_volatile_arg(args, "--projectName=", "project")
|
||||
add_volatile_arg(args, "--requester=", "requester")
|
||||
add_volatile_arg(args, "--revisionOrderId=", "revision_order_id")
|
||||
add_volatile_arg(args, "--taskId=", "task_id")
|
||||
add_volatile_arg(args, "--taskName=", "task_name")
|
||||
add_volatile_arg(args, "--variantName=", "build_variant")
|
||||
add_volatile_arg(args, "--versionId=", "version_id")
|
||||
add_volatile_arg(args, "--requester=", "requester")
|
||||
|
||||
|
||||
class ResmokeShimContext:
|
||||
|
||||
@@ -93,16 +93,18 @@ def print_test_runtimes_status():
|
||||
def print_evergreen_expansions():
|
||||
for expansion in [
|
||||
"build_id",
|
||||
"build_variant",
|
||||
"distro_id",
|
||||
"execution",
|
||||
"otel_parent_id",
|
||||
"otel_trace_id",
|
||||
"project",
|
||||
"requester",
|
||||
"revision",
|
||||
"revision_order_id",
|
||||
"task_id",
|
||||
"task_name",
|
||||
"build_variant",
|
||||
"version_id",
|
||||
"requester",
|
||||
]:
|
||||
value = os.environ.get(expansion, "")
|
||||
if value:
|
||||
|
||||
@@ -16,7 +16,7 @@ import textwrap
|
||||
import traceback
|
||||
from functools import cache
|
||||
from pathlib import Path
|
||||
from typing import Dict, Optional
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
import git
|
||||
import pymongo.uri_parser
|
||||
@@ -34,6 +34,7 @@ from buildscripts.resmokelib.run import TestRunner
|
||||
from buildscripts.resmokelib.utils import autoloader
|
||||
from buildscripts.resmokelib.utils.batched_baggage_span_processor import BatchedBaggageSpanProcessor
|
||||
from buildscripts.resmokelib.utils.file_span_exporter import FileSpanExporter
|
||||
from buildscripts.resmokelib.utils.otel_id_generator import ResmokeOtelIdGenerator
|
||||
from buildscripts.util.read_config import read_config_file
|
||||
from buildscripts.util.taskname import determine_task_base_name
|
||||
from buildscripts.util.teststats import HistoricTaskData
|
||||
@@ -286,6 +287,8 @@ def _set_up_tracing(
|
||||
trace_id: Optional[str],
|
||||
parent_span_id: Optional[str],
|
||||
extra_context: Dict[str, object],
|
||||
suite_files: Optional[List[str]] = None,
|
||||
shard_index: Optional[int] = None,
|
||||
) -> bool:
|
||||
"""Try to set up otel tracing. On success return True. On failure return False.
|
||||
|
||||
@@ -306,7 +309,9 @@ def _set_up_tracing(
|
||||
# Service name is required for most backends
|
||||
resource = Resource(attributes={SERVICE_NAME: "resmoke"})
|
||||
|
||||
provider = TracerProvider(resource=resource)
|
||||
# Use custom ID generator to prevent span ID collisions in parallel resmoke invocations
|
||||
id_generator = ResmokeOtelIdGenerator(suite_files=suite_files, shard_index=shard_index)
|
||||
provider = TracerProvider(resource=resource, id_generator=id_generator)
|
||||
if otel_collector_dir:
|
||||
try:
|
||||
otel_collector_dir = Path(otel_collector_dir)
|
||||
@@ -861,6 +866,8 @@ flags in common: {common_set}
|
||||
_config.OTEL_TRACE_ID,
|
||||
_config.OTEL_PARENT_ID,
|
||||
extra_context=extra_context,
|
||||
suite_files=_config.SUITE_FILES,
|
||||
shard_index=_config.SHARD_INDEX,
|
||||
)
|
||||
if not setup_success:
|
||||
print(
|
||||
|
||||
@@ -16,6 +16,7 @@ py_library(
|
||||
"globstar.py",
|
||||
"history.py",
|
||||
"jscomment.py",
|
||||
"otel_id_generator.py",
|
||||
"otel_thread_pool_executor.py",
|
||||
"otel_utils.py",
|
||||
"queue.py",
|
||||
|
||||
58
buildscripts/resmokelib/utils/otel_id_generator.py
Normal file
58
buildscripts/resmokelib/utils/otel_id_generator.py
Normal file
@@ -0,0 +1,58 @@
|
||||
import random
|
||||
import time
|
||||
from typing import List, Optional
|
||||
|
||||
from opentelemetry import trace
|
||||
from opentelemetry.sdk.trace import IdGenerator
|
||||
|
||||
|
||||
class ResmokeOtelIdGenerator(IdGenerator):
    """
    ID generator that creates unique span IDs across parallel resmoke.py invocations.

    IDs are drawn from a private ``random.Random`` instance (the module-level
    generator is left untouched) whose seed is derived from:
    - The current wall-clock time in milliseconds
    - The concatenated suite file names, when provided
    - The shard index, when provided (unique per parallel resmoke shard)

    This helps prevent ID collisions when multiple resmoke.py processes are
    run in parallel with the same traceID and parentSpanID.
    """

    def __init__(self, suite_files: Optional[List[str]] = None, shard_index: Optional[int] = None):
        """
        Initialize the unique span ID generator.

        Args:
            suite_files: Optional list of suites; joined into one string for the seed
            shard_index: Optional shard index to incorporate into the seed
        """
        # The millisecond timestamp is always the first seed component.
        components = [int(time.time() * 1_000)]
        if suite_files is not None:
            components.append("".join(suite_files))
        if shard_index is not None:
            components.append(shard_index)

        # A dedicated Random instance keeps this generator from interfering with
        # other users of the random module.
        self._rng = random.Random(hash(tuple(components)))

    def _draw_id(self, bits: int, invalid_id: int) -> int:
        """Return the next `bits`-wide random value, redrawing if it equals `invalid_id`."""
        while True:
            candidate = self._rng.getrandbits(bits)
            if candidate != invalid_id:
                return candidate

    def generate_span_id(self) -> int:
        """Generate a unique 64-bit span ID."""
        return self._draw_id(64, trace.INVALID_SPAN_ID)

    def generate_trace_id(self) -> int:
        """Generate a unique 128-bit trace ID."""
        return self._draw_id(128, trace.INVALID_TRACE_ID)
|
||||
@@ -1292,16 +1292,18 @@ functions:
|
||||
binary: bash
|
||||
env:
|
||||
build_id: ${build_id}
|
||||
build_variant: ${build_variant}
|
||||
distro_id: ${distro_id}
|
||||
execution: ${execution}
|
||||
otel_parent_id: ${otel_parent_id}
|
||||
otel_trace_id: ${otel_trace_id}
|
||||
project: ${project}
|
||||
requester: ${requester}
|
||||
revision: ${revision}
|
||||
revision_order_id: ${revision_order_id}
|
||||
task_id: ${task_id}
|
||||
task_name: ${task_name}
|
||||
build_variant: ${build_variant}
|
||||
version_id: ${version_id}
|
||||
requester: ${requester}
|
||||
args:
|
||||
- "./src/evergreen/resmoke_tests_execute_bazel.sh"
|
||||
|
||||
|
||||
@@ -108,6 +108,45 @@ function symlink_test_logs() {
|
||||
done
|
||||
}
|
||||
|
||||
# Combine all resmoke telemetry and place it where Evergreen expects it: ${workdir}/build/OTelTraces.
# Metrics are batched into line-separated JSON files targeting at most 4MB each. Evergreen processes
# fewer files faster, but hits message size limitations if they are too large.
#
# Reads:  every regular file under ${workdir}/results whose path matches '*metrics/metrics*.json'.
# Writes: ${workdir}/build/OTelTraces/metrics.json, then metrics_1.json, metrics_2.json, ... each
#         time the current batch would grow past the limit. Input files are never split, so a
#         single input larger than the limit is still written whole into its own batch.
function combine_metrics() {
    local output_dir="${workdir}/build/OTelTraces"
    mkdir -p "$output_dir"

    local max_size=$((4 * 1024 * 1024)) # 4MB in bytes
    local file_counter=0
    local current_size=0
    local current_output="${output_dir}/metrics.json"

    # Create initial empty file
    >"$current_output"

    # The loop body runs in the pipeline's subshell. That is fine here: the counters only need
    # to persist across iterations, and nothing reads them after the loop.
    find "${workdir}/results" -wholename '*metrics/metrics*.json' -type f -print0 | while IFS= read -r -d '' file; do
        local file_size=$(stat -c%s "$file")
        local newline_size=1

        # Check if adding this file would exceed the limit. The current_size > 0 guard keeps an
        # oversized first file from leaving an empty batch behind.
        if ((current_size + file_size + newline_size > max_size && current_size > 0)); then
            # Start a new file.
            # Use a plain assignment rather than ((file_counter++)): the post-increment evaluates
            # to 0 on the first rollover, which yields exit status 1 and would abort this subshell
            # (silently dropping the remaining files) when errexit is enabled.
            file_counter=$((file_counter + 1))
            current_output="${output_dir}/metrics_${file_counter}.json"
            current_size=0
            >"$current_output"
        fi

        # Append the file content
        cat "$file" >>"$current_output"
        echo "" >>"$current_output" # Adds a single newline after each file's content

        # Update current size
        current_size=$((current_size + file_size + newline_size))
    done

    echo 'Combined OTel metrics json'
}
|
||||
|
||||
# Combines all Resmoke test report JSONs into a single JSON.
|
||||
function combine_reports() {
|
||||
local report_files=$(find "${workdir}" -name 'report*.json' -type f 2>/dev/null)
|
||||
@@ -187,6 +226,8 @@ while IFS= read -r test_result; do
|
||||
popd >/dev/null
|
||||
done < <(enumerate_test_results)
|
||||
|
||||
combine_metrics
|
||||
|
||||
failures=$(combine_reports)
|
||||
|
||||
write_bazel_invocation
|
||||
|
||||
Reference in New Issue
Block a user