# File: mongo/buildscripts/resmokelib/testing/hooks/combine_benchmark_results.py
# Last modified: 2022-03-28 09:14:20 +00:00
#
# 314 lines
# 11 KiB
# Python

"""Module for generating the test results file fed into the perf plugin."""
import collections
import datetime
import json
from typing import List, Dict, Any
from buildscripts.resmokelib import config as _config
from buildscripts.resmokelib.errors import CedarReportError
from buildscripts.resmokelib.testing.hooks import interface
from buildscripts.util.cedar_report import CedarMetric, CedarTestReport
class CombineBenchmarkResults(interface.Hook):
    """CombineBenchmarkResults class.

    The CombineBenchmarkResults hook combines test results from
    individual benchmark files to a single file. This is useful for
    generating the json file to feed into the Evergreen performance
    visualization plugin.

    The hook is a no-op unless `_config.PERF_REPORT_FILE` is set.
    """

    DESCRIPTION = "Combine JSON results from individual benchmarks"

    IS_BACKGROUND = False

    def __init__(self, hook_logger, fixture):
        """Initialize CombineBenchmarkResults."""
        interface.Hook.__init__(self, hook_logger, fixture, CombineBenchmarkResults.DESCRIPTION)

        # Output paths; both are read from the resmoke configuration.
        self.legacy_report_file = _config.PERF_REPORT_FILE
        self.cedar_report_file = _config.CEDAR_REPORT_FILE

        # Reports grouped by name without thread.
        self.benchmark_reports = {}

        # Suite start/end timestamps, filled in by before_suite()/after_suite().
        self.create_time = None
        self.end_time = None

    @staticmethod
    def _strftime(time):
        """Format a datetime as `YYYY-MM-DDTHH:MM:SSZ`."""
        return time.strftime("%Y-%m-%dT%H:%M:%SZ")

    def after_test(self, test, test_report):
        """Load the JSON report written by `test` and merge it into the combined results.

        Does nothing when no legacy report file is configured.
        """
        if self.legacy_report_file is None:
            return

        bm_report_path = test.report_name()

        # JSON text is UTF-8 by specification, so do not depend on the locale encoding.
        with open(bm_report_path, "r", encoding="utf-8") as bm_report_file:
            bm_report_dict = json.load(bm_report_file)
            self._parse_report(bm_report_dict)

    def before_suite(self, test_report):
        """Set suite start time."""
        self.create_time = datetime.datetime.now()

    def after_suite(self, test_report, teardown_flag=None):
        """Write the combined legacy and Cedar reports.

        Does nothing when no legacy report file is configured. The Cedar
        report is always generated (so that duplicated metric names are
        detected) but is only written when a Cedar report file is configured.

        :param test_report: unused by this hook.
        :param teardown_flag: optional object with a `set()` method; it is
            set when Cedar report generation fails.
        :raises CedarReportError: if generating the Cedar report fails.
        """
        if self.legacy_report_file is None:
            return

        self.end_time = datetime.datetime.now()
        legacy_report = self._generate_perf_plugin_report()
        with open(self.legacy_report_file, "w", encoding="utf-8") as fh:
            json.dump(legacy_report, fh)

        try:
            cedar_report = self._generate_cedar_report()
        except CedarReportError:
            # `teardown_flag` defaults to None; guard so the original error is
            # not masked by an AttributeError.
            if teardown_flag is not None:
                teardown_flag.set()
            raise

        # Treat an unset Cedar path like an unset legacy path: nothing to write.
        if self.cedar_report_file is None:
            return

        with open(self.cedar_report_file, "w", encoding="utf-8") as fh:
            json.dump(cedar_report, fh)

    def _generate_perf_plugin_report(self):
        """Format the data to look like a perf plugin report."""
        results = [{
            "name": name,
            "context": report.context._asdict(),
            "results": report.generate_perf_plugin_dict(),
        } for name, report in self.benchmark_reports.items()]

        return {
            "start": self._strftime(self.create_time),
            "end": self._strftime(self.end_time),
            "errors": [],  # There are no errors if we have gotten this far.
            "results": results,
        }

    def _generate_cedar_report(self) -> List[dict]:
        """Format the data to look like a cedar report.

        :return: one dict per (benchmark name, thread level) pair.
        :raises CedarReportError: if any thread level of a benchmark has
            duplicated metric names.
        """
        cedar_report = []

        for name, report in self.benchmark_reports.items():
            cedar_metrics = report.generate_cedar_metrics()

            # Validate every thread level before emitting anything for this benchmark.
            for thread_metrics in cedar_metrics.values():
                if report.check_dup_metric_names(thread_metrics):
                    msg = f"The test '{name}' has duplicated metric names."
                    raise CedarReportError(msg)

            for threads_count, thread_metrics in cedar_metrics.items():
                test_report = CedarTestReport(test_name=name, thread_level=threads_count,
                                              metrics=thread_metrics)
                cedar_report.append(test_report.as_dict())

        return cedar_report

    def _parse_report(self, report_dict):
        """Group every entry of `report_dict["benchmarks"]` under its base benchmark name."""
        context = report_dict["context"]

        for benchmark_res in report_dict["benchmarks"]:
            bm_name_obj = _BenchmarkThreadsReport.parse_bm_name(benchmark_res)

            # The first entry seen for a base name creates its report, using this file's context.
            if bm_name_obj.base_name not in self.benchmark_reports:
                self.benchmark_reports[bm_name_obj.base_name] = _BenchmarkThreadsReport(context)
            self.benchmark_reports[bm_name_obj.base_name].add_report(bm_name_obj, benchmark_res)
# Structured form of a Benchmark name: the base name, the thread count and
# the statistic type that were parsed out of it.
_BenchmarkName = collections.namedtuple(
    "_BenchmarkName",
    ("base_name", "thread_count", "statistic_type"),
)
class _BenchmarkThreadsReport(object):
    """_BenchmarkThreadsReport class.

    Class representation of a report for all thread levels of a single
    benchmark test. Each report is designed to correspond to one graph
    in the Evergreen perf plugin.

    A raw Benchmark report looks like the following:
    {
      "context": {
        "date": "2015/03/17-18:40:25",
        "executable": "./build/opt/mongo/db/concurrency/lock_manager_bm",
        "num_cpus": 40,
        "mhz_per_cpu": 2801,
        "cpu_scaling_enabled": false,
        "caches": [
        ],
        "library_build_type": "debug"
      },
      "benchmarks": [
        {
          "name": "BM_SetInsert/arg name:1024/threads:10",
          "iterations": 21393,
          "real_time": 32724,
          "cpu_time": 33355,
          "bytes_per_second": 1199226,
          "items_per_second": 299807
        }
      ]
    }
    """

    DEFAULT_CEDAR_METRIC_NAME = "latency_per_op"

    # Map benchmark metric type to the type in Cedar
    # https://github.com/evergreen-ci/cedar/blob/87e22df45845440cf299d4ee1f406e8c00ff05ae/perf.proto#L101-L115
    BENCHMARK_TO_CEDAR_METRIC_TYPE_MAP = {
        "latency": "LATENCY",
        "mean": "MEAN",
        "median": "MEDIAN",
        "stddev": "STANDARD_DEVIATION",
    }

    CONTEXT_FIELDS = [
        "date",
        "num_cpus",
        "mhz_per_cpu",
        "library_build_type",
        "executable",
        "caches",
        "cpu_scaling_enabled",
    ]

    Context = collections.namedtuple(
        typename="Context",
        field_names=CONTEXT_FIELDS,
        # We need a default for cpu_scaling_enabled, since newer
        # google benchmark doesn't report a value if it can't make a
        # determination.
        defaults=["unknown"],
    )  # type: ignore

    def __init__(self, context_dict):
        """Initialize the report from the "context" section of a raw Benchmark report.

        :param context_dict: mapping parsed from JSON; keys that are not
            `Context` fields are ignored.
        :raises TypeError: if a `Context` field without a default is missing.
        """
        # `context_dict` was parsed from a json file and might have additional fields.
        relevant = {
            key: value
            for key, value in context_dict.items() if key in self.Context._fields
        }
        self.context = self.Context(**relevant)

        # list of benchmark runs for each thread.
        self.thread_benchmark_map = collections.defaultdict(list)

    def add_report(self, bm_name_obj, report):
        """Record `report` under the thread count of `bm_name_obj`."""
        self.thread_benchmark_map[bm_name_obj.thread_count].append(report)

    def generate_perf_plugin_dict(self):
        """Generate perf plugin data points of the following format.

        "1": {
          "error_values": [
            0,
            0,
            0
          ],
          "ops_per_sec": 9552.108279243452,
          "ops_per_sec_values": [
            9574.812658450564,
            9522.642340821469,
            9536.252775275878
          ]
        },

        Thread levels that only contain aggregate entries produce no data
        point, because there is nothing to average for them.
        """
        res = {}
        for thread_count, reports in self.thread_benchmark_map.items():
            # Don't show Benchmark's included statistics to prevent cluttering up the graph.
            # Take the negative of the latency numbers to preserve the higher is better semantics.
            latencies = [
                -1 * report["cpu_time"] for report in reports
                if report.get("run_type") != "aggregate"
            ]

            # Skip the level instead of dividing by zero below.
            if not latencies:
                continue

            res[thread_count] = {
                "error_values": [0] * len(latencies),
                # This is actually storing latency per op, not ops/s
                "ops_per_sec_values": latencies,
                "ops_per_sec": sum(latencies) / len(latencies),
            }
        return res

    def generate_cedar_metrics(self) -> Dict[int, List["CedarMetric"]]:
        """Generate metrics for Cedar.

        :return: metrics grouped by each report's "threads" value.
        :raises KeyError: if a report has no "threads" or "cpu_time" entry, or
            its "aggregate_name" is not in BENCHMARK_TO_CEDAR_METRIC_TYPE_MAP.
        """
        res = {}

        for reports in self.thread_benchmark_map.values():
            for report in reports:
                # Entries without an aggregate name are individual latency measurements.
                aggregate_name = report.get("aggregate_name", "latency")

                if aggregate_name == "latency":
                    # Repetitions are told apart by their index.
                    idx = report.get("repetition_index", 0)
                    metric_name = f"{self.DEFAULT_CEDAR_METRIC_NAME}_{idx}"
                else:
                    metric_name = f"{self.DEFAULT_CEDAR_METRIC_NAME}_{aggregate_name}"

                metric_type = self.BENCHMARK_TO_CEDAR_METRIC_TYPE_MAP[aggregate_name]

                metric = CedarMetric(name=metric_name, type=metric_type, value=report["cpu_time"])
                res.setdefault(report["threads"], []).append(metric)

        return res

    @staticmethod
    def check_dup_metric_names(metrics: List["CedarMetric"]) -> bool:
        """Check duplicated metric names for Cedar.

        :return: True if two metrics in `metrics` share a name.
        """
        seen = set()
        for metric in metrics:
            if metric.name in seen:
                return True
            seen.add(metric.name)
        return False

    @staticmethod
    def parse_bm_name(benchmark_res: Dict[str, Any]):
        """
        Split the benchmark name into base_name, thread_count and statistic_type.

        The base name is the benchmark name minus the thread count and any statistics.
        Testcases of the same group will be shown on a single perf graph.

        benchmark_res["name"] look like the following:
        "BM_SetInsert/arg name:1024/threads:10_mean"
        "BM_SetInsert/arg 1/arg 2"
        "BM_SetInsert_mean"

        :return: a `_BenchmarkName`; `thread_count` is a string and
            `statistic_type` is None when there is no "aggregate_name".
        """
        name_str = benchmark_res["name"]
        statistic_type = benchmark_res.get("aggregate_name", None)

        # Step 1: Remove the statistic type suffix from the name.
        # Match the whole "_<statistic_type>" suffix so statistic names that
        # themselves contain an underscore are stripped too.
        if statistic_type is not None:
            suffix = "_" + statistic_type
            if name_str.endswith(suffix):
                name_str = name_str[:-len(suffix)]

        # Step 2: Get the thread count and name.
        thread_section = name_str.rsplit("/", 1)[-1]
        if thread_section.startswith("threads:"):
            base_name = name_str.rsplit("/", 1)[0]
            thread_count = thread_section.split(":")[-1]
        else:  # There is no explicit thread count, so the thread count is 1.
            thread_count = "1"
            base_name = name_str

        return _BenchmarkName(base_name, thread_count, statistic_type)