"""Validate (and optionally fix) Bazel build metadata under ``src/``.

Four independent checks are run by :func:`main`:

* ``//:clang_tidy_config_files`` lists a target for every ``.clang-tidy`` file.
* Every ``mongo_unittest`` target carries the group tag derived from its path hash.
* ``idl_generator`` targets follow the ``<stem>_gen`` / ``<stem>.idl`` naming rule.
* ``mongo_cc_*`` targets do not list header files in ``srcs``.
"""

import argparse
import hashlib
import json
import os
import subprocess
import sys
import time
from collections import deque
from pathlib import Path

sys.path.append(".")

from buildscripts.install_bazel import install_bazel, install_buildozer
from buildscripts.simple_report import make_report, put_report, try_combine_reports

# Ordering of the group names when they are processed one after another.
groups_sort_keys = {
    "first": 1,
    "second": 2,
    "third": 3,
    "fourth": 4,
    "fifth": 5,
    "sixth": 6,
    "seventh": 7,
    "eighth": 8,
}

# First hex digit of the sha256 of a normalized test path -> group name.
# Two hex digits per group gives eight buckets.
_HEX_DIGIT_TO_GROUP = {
    # group1
    "0": "first",
    "1": "first",
    # group2
    "2": "second",
    "3": "second",
    # group3
    "4": "third",
    "5": "third",
    # group4
    "6": "fourth",
    "7": "fourth",
    # group5
    "8": "fifth",
    "9": "fifth",
    # group6
    "a": "sixth",
    "b": "sixth",
    # group7
    "c": "seventh",
    "d": "seventh",
    # group8
    "e": "eighth",
    "f": "eighth",
}


def find_group(unittest_paths):
    """Bucket unittest paths/labels into the eight named groups.

    Each path is normalized (``:`` and ``\\`` become ``/``, a leading ``//`` is
    dropped), a leading ``lib`` and everything from the first ``.`` are removed
    from the basename, and the first hex digit of the sha256 of the result
    selects the group.

    Args:
        unittest_paths: Iterable of path or label strings; each must start with
            ``src/`` after normalization.

    Returns:
        A JSON string (indent 4) mapping group name -> list of the original,
        un-normalized paths. Groups without members are absent.

    Exits the process with status 1 if a path is not under ``src/``.
    """
    group_to_path: dict[str, list[str]] = {}

    for path in unittest_paths:
        norm_path = path.replace(":", "/").replace("\\", "/")
        if norm_path.startswith("//"):
            norm_path = norm_path[2:]
        if not norm_path.startswith("src/"):
            print(f"ERROR: {path} not relative to mongo repo root")
            sys.exit(1)

        basename = os.path.basename(norm_path)
        if basename.startswith("lib"):
            basename = basename[3:]
        ext = basename.find(".")
        if ext != -1:
            basename = basename[:ext]
        dirname = os.path.dirname(norm_path)

        # Re-normalize separators so the hash is the same on every platform.
        hash_path = os.path.join(dirname, basename).replace("\\", "/")
        first_char = hashlib.sha256(hash_path.encode()).hexdigest()[0]
        group_to_path.setdefault(_HEX_DIGIT_TO_GROUP[first_char], []).append(path)

    return json.dumps(group_to_path, indent=4)


def find_multiple_groups(test, groups):
    """Return the names of every group in *groups* whose member list contains *test*."""
    return [group for group in groups if test in groups[group]]


def iter_clang_tidy_files(root: str | Path) -> list[Path]:
    """Return a list of repo-relative Paths to '.clang-tidy' files.

    - Uses os.scandir for speed
    - Does NOT follow symlinks
    - Directories that raise PermissionError are skipped
    - Result order follows directory traversal and is not sorted

    Paths are relative to the resolved *root*.
    """
    root = Path(root).resolve()
    results: list[Path] = []
    stack = deque([root])

    while stack:
        current = stack.pop()
        try:
            with os.scandir(current) as it:
                for entry in it:
                    if entry.is_dir(follow_symlinks=False):
                        stack.append(Path(entry.path))
                    elif entry.is_file(follow_symlinks=False) and entry.name == ".clang-tidy":
                        # repo-relative path
                        results.append(Path(entry.path).resolve().relative_to(root))
        except PermissionError:
            continue

    return results


def validate_clang_tidy_configs(generate_report, fix):
    """Check that ``//:clang_tidy_config_files`` lists every ``.clang-tidy`` directory.

    For each ``.clang-tidy`` file under ``src/mongo`` the expected entry is
    ``//<dir>:clang_tidy_config``. The expected and current ``srcs`` are compared
    as sorted lists, because directory traversal order is arbitrary.

    Args:
        generate_report: When true, emit a report entry on mismatch.
        fix: When true, rewrite ``srcs`` with buildozer on mismatch.

    Raises:
        Exception: If the buildozer output contains no parsable ``srcs`` line.
    """
    buildozer = install_buildozer()

    mongo_dir = "src/mongo"
    tidy_files = iter_clang_tidy_files(mongo_dir)

    p = subprocess.run(
        [buildozer, "print label srcs", "//:clang_tidy_config_files"],
        capture_output=True,
        text=True,
    )
    tidy_targets = None
    for line in p.stdout.splitlines():
        # Expected shape: "//:clang_tidy_config_files [//a:x //b:y]"
        if line.startswith("//") and line.endswith("]"):
            tokens = line.split("[")
            tidy_targets = tokens[1][:-1].split(" ")
            break
    if tidy_targets is None:
        print(p.stderr)
        raise Exception(f"could not parse tidy config targets from '{p.stdout}'")

    # An empty list prints as "[]", which splits to [""].
    if tidy_targets == [""]:
        tidy_targets = []
    tidy_targets = sorted(tidy_targets)

    # as_posix() keeps the label separator "/" regardless of platform.
    all_targets = sorted(
        "//" + (Path(mongo_dir) / tidy_file).parent.as_posix() + ":clang_tidy_config"
        for tidy_file in tidy_files
    )

    if all_targets != tidy_targets:
        msg = f"Incorrect clang tidy config targets: {all_targets} != {tidy_targets}"
        print(msg)
        if generate_report:
            report = make_report("//:clang_tidy_config_files", msg, 1)
            try_combine_reports(report)
            put_report(report)

        if fix:
            subprocess.run(
                [buildozer, f"set srcs {' '.join(all_targets)}", "//:clang_tidy_config_files"]
            )


def _query_tagged_unittests(bazel_bin, query_opts, tag, description):
    """Run a timed ``bazel query`` for ``extract_debug`` targets tagged *tag*.

    Prints ``Query all <description>... <seconds>s`` and returns the query's
    stdout lines. On a non-zero bazel exit the captured output is printed and
    the process exits with bazel's return code.
    """
    start = time.time()
    sys.stdout.write(f"Query all {description}... ")
    sys.stdout.flush()
    try:
        query_proc = subprocess.run(
            [
                bazel_bin,
                "query",
                rf'kind(extract_debug, attr(tags, "[\[ ]{tag}[,\]]", //src/...))',
            ]
            + query_opts,
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as exc:
        print("BAZEL ERROR:")
        print(exc.stdout)
        print(exc.stderr)
        sys.exit(exc.returncode)
    sys.stdout.write("{:0.2f}s\n".format(time.time() - start))
    return query_proc.stdout.splitlines()


def validate_bazel_groups(generate_report, fix):
    """Check that each unittest carries exactly the group tag its path hashes to.

    Three kinds of problems are reported per group:

    * a target has the group tag but is not a ``mongo_unittest`` at all,
    * a unittest belonging to the group lacks the group tag,
    * a unittest has the group tag but belongs to a different group.

    Args:
        generate_report: When true, emit one report entry per failure.
        fix: When true, apply ``add tags`` / ``remove tags`` with buildozer.
    """
    buildozer = install_buildozer()
    bazel_bin = install_bazel(".")

    query_opts = [
        "--implicit_deps=False",
        "--tool_deps=False",
        "--include_aspects=False",
        "--bes_backend=",
        "--bes_results_url=",
    ]

    bazel_unittests = _query_tagged_unittests(
        bazel_bin, query_opts, "mongo_unittest", "unittest binaries"
    )
    all_unittests = set(bazel_unittests)

    buildozer_update_cmds = []
    groups = json.loads(find_group(bazel_unittests))
    failures = []

    for group in sorted(groups, key=lambda x: groups_sort_keys[x]):
        tag = f"mongo_unittest_{group}_group"
        group_tests = _query_tagged_unittests(bazel_bin, query_opts, tag, f"{tag} unittests")

        expected_tests = groups[group]
        if expected_tests == group_tests:
            continue

        # Sets make the membership checks below O(1) instead of O(n).
        expected = set(expected_tests)
        tagged = set(group_tests)

        for test in group_tests:
            if test not in all_unittests:
                failures.append(
                    [
                        test + " tag",
                        f"{test} not a 'mongo_unittest' but has '{tag}' tag.",
                    ]
                )
                print(failures[-1][1])
                if fix:
                    buildozer_update_cmds.append([f"remove tags {tag}", test])

        for test in expected_tests:
            if test not in tagged:
                failures.append([test + " tag", f"{test} missing '{tag}'"])
                print(failures[-1][1])
                if fix:
                    buildozer_update_cmds.append([f"add tags {tag}", test])

        for test in group_tests:
            # Non-unittests were already reported (and scheduled for removal)
            # by the first loop; do not report them a second time.
            if test not in expected and test in all_unittests:
                failures.append(
                    [
                        test + " tag",
                        f"{test} is tagged in the wrong group.",
                    ]
                )
                print(failures[-1][1])
                if fix:
                    buildozer_update_cmds.append([f"remove tags {tag}", test])

    if fix:
        for cmd in buildozer_update_cmds:
            subprocess.run([buildozer] + cmd)

    if generate_report:
        for label, message in failures:
            report = make_report(label, message, 1)
            try_combine_reports(report)
            put_report(report)


def validate_idl_naming(generate_report: bool, fix: bool) -> None:
    """
    Enforce:
      idl_generator(
        name = "<stem>_gen",
        src  = "<stem>.idl" | ":gen_target"   # where gen_target produces exactly one <stem>.idl
      )
    Single `bazel query --output=xml`, parse in-process.
    Also resolves src labels to generators.

    Args:
        generate_report: When true, emit one report entry per violation.
        fix: There is no auto-fix; when true and violations exist, exit 1.
    """
    import xml.etree.ElementTree as ET

    bazel_bin = install_bazel(".")
    qopts = [
        "--implicit_deps=False",
        "--tool_deps=False",
        "--include_aspects=False",
        "--bes_backend=",
        "--bes_results_url=",
    ]

    # One narrowed query: only rules created by the idl_generator macro
    try:
        proc = subprocess.run(
            [
                bazel_bin,
                "query",
                "attr(generator_function, idl_generator, //src/...)",
                "--output=xml",
            ]
            + qopts,
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as exc:
        print("BAZEL ERROR (narrowed xml):")
        print(exc.stdout)
        print(exc.stderr)
        sys.exit(exc.returncode)

    root = ET.fromstring(proc.stdout)
    failures: list[tuple[str, str]] = []

    def _val(rule, kind, attr):
        """Return the ``value`` of the ``<kind name=attr>`` child of *rule*, or None."""
        n = rule.find(f'./{kind}[@name="{attr}"]')
        return n.get("value") if n is not None else None

    # Prepass: map rule label -> outputs so we can resolve src labels that generate an .idl
    outputs_by_rule: dict[str, list[str]] = {}
    for r in root.findall(".//rule"):
        rname = r.get("name")
        if not rname:
            continue
        outputs_by_rule[rname] = [
            n.get("name") for n in r.findall("./rule-output") if n.get("name")
        ]

    for rule in root.findall(".//rule"):
        # Already narrowed, but keep the sentinel check cheap
        if _val(rule, "string", "generator_function") != "idl_generator":
            continue

        rlabel = rule.get("name") or ""
        if not (rlabel.startswith("//") and ":" in rlabel):
            failures.append((rlabel, "Malformed idl_generator rule label"))
            continue
        pkg, name = rlabel[2:].split(":", 1)

        # Resolve src from label/string/srcs list
        src_val = _val(rule, "label", "src") or _val(rule, "string", "src")
        if not src_val:
            srcs_vals = []
            for lst in rule.findall('./list[@name="srcs"]'):
                srcs_vals += [n.get("value") for n in lst.findall("./label") if n.get("value")]
                srcs_vals += [n.get("value") for n in lst.findall("./string") if n.get("value")]
            if len(srcs_vals) != 1:
                failures.append(
                    (rlabel, f"'src'/'srcs' must have exactly one entry, got: {srcs_vals}")
                )
                continue
            src_val = srcs_vals[0]

        src = src_val.replace("\\", "/")
        src_base: str | None = None

        if src.startswith(("//", ":")):
            if src.startswith("//"):
                # partition() tolerates a label without ':' (sname becomes "")
                # so a malformed label is reported instead of crashing.
                spkg, _, sname = src[2:].partition(":")
                if spkg != pkg:
                    failures.append(
                        (rlabel, f"'src' must be in same package '{pkg}', got '{src}'")
                    )
                gen_label = src
            else:
                sname = src[1:]
                gen_label = f"//{pkg}:{sname}"

            if sname.endswith(".idl"):
                src_base = os.path.basename(sname)
            else:
                # src names a generating rule: it must yield exactly one .idl output.
                outs = outputs_by_rule.get(gen_label, [])
                idl_outs = [o for o in outs if o.endswith(".idl")]
                if len(idl_outs) != 1:
                    failures.append(
                        (
                            rlabel,
                            f"'src' '{src}' must produce exactly one .idl, got: {idl_outs or outs}",
                        )
                    )
                    continue
                src_base = os.path.basename(idl_outs[0].rpartition(":")[2])
        else:
            if src.startswith("../") or "/../" in src:
                failures.append((rlabel, f"'src' must be within package '{pkg}', got '{src}'"))
            src_base = os.path.basename(src)

        if not (src_base and src_base.endswith(".idl")):
            failures.append((rlabel, f"'src' must resolve to a .idl file, got: {src_base or src}"))
            continue

        if not name.endswith("_gen"):
            failures.append((rlabel, "Target name must end with '_gen'"))

        stem_from_name = name[:-4] if name.endswith("_gen") else name
        stem_from_src = src_base[:-4]  # strip ".idl"
        if stem_from_name != stem_from_src:
            failures.append(
                (
                    rlabel,
                    f"Stem mismatch: name '{name}' vs src '{src_base}'. "
                    f"Expected src basename '{stem_from_name}.idl'.",
                )
            )

    for lbl, msg in failures:
        print(f"IDL naming violation: {lbl}: {msg}")
        if generate_report:
            report = make_report(lbl, msg, 1)
            try_combine_reports(report)
            put_report(report)

    # NOTE(review): the run only fails when --fix is given; without --fix,
    # violations are printed/reported but the exit status is unaffected.
    # Confirm this asymmetry with validate_private_headers is intended.
    if fix and failures:
        sys.exit(1)


def validate_private_headers(generate_report: bool, fix: bool) -> None:
    """
    Fast header linter/fixer using concurrent buildozer reads:

      buildozer print label srcs <SCOPE>:%<macro>

    - Lints if any header appears anywhere in the printed block (including select()/glob()).
    - Auto-fixes ONLY concrete items in the first [...] (top-level list).
    - Fails the run if a non-concrete header is detected (select()/glob()).
    - Headers listed in PUBLIC_KEEP are allowed and never reported.

    Args:
        generate_report: When true, emit one report entry per offending target.
        fix: When true, move concrete headers from ``srcs`` to ``private_hdrs``.
    """
    import re
    from concurrent.futures import ThreadPoolExecutor
    from shlex import split as shlex_split

    # ---- Config ----
    HEADER_EXTS = (".h", ".hh", ".hpp", ".hxx")
    HEADER_RE = re.compile(r"\.(h|hh|hpp|hxx)\b")
    PUBLIC_KEEP = {
        "//src/mongo/platform:basic.h",
        "//src/mongo/platform:windows_basic.h",
    }
    SCOPE = "//src/mongo/..."  # limit the scan to this subtree
    MACRO_SELECTORS = [
        "%mongo_cc_library",
        "%mongo_cc_binary",
        "%mongo_cc_unit_test",
        "%mongo_cc_benchmark",
        "%mongo_cc_integration_test",
        "%mongo_cc_fuzzer_test",
        "%mongo_cc_extension_shared_library",
    ]
    SKIP_SUFFIXES = ("_shared_archive", "_hdrs_wrap")
    SKIP_PKG_SUBSTR = "/third_party/"

    # If True, exit(1) whenever a header is found only via select()/glob()
    FAIL_ON_STRUCTURED = True

    buildozer = install_buildozer()

    def _run_print(selector: str) -> tuple[str, str]:
        """Run one buildozer print invocation; return (selector, stdout)."""
        try:
            out = subprocess.run(
                [buildozer, "print label srcs", f"{SCOPE}:{selector}"],
                capture_output=True,
                text=True,
                check=True,
            ).stdout
            return selector, out
        except subprocess.CalledProcessError as exc:
            # surface error and keep going (treated as empty output)
            print(f"BUILDOZER ERROR (print label srcs) for selector {selector}:", file=sys.stderr)
            print(exc.stdout, file=sys.stderr)
            print(exc.stderr, file=sys.stderr)
            return selector, ""

    # 1) Run all macro prints concurrently. map() yields results in selector
    #    order, so the reported failures are ordered deterministically.
    with ThreadPoolExecutor(max_workers=min(4, max(1, len(MACRO_SELECTORS)))) as ex:
        outputs = [stdout for _, stdout in ex.map(_run_print, MACRO_SELECTORS) if stdout]

    if not outputs:
        return

    combined = "\n".join(outputs)

    # 2) Parse into target blocks: start at lines beginning with //src/mongo...
    target_line_re = re.compile(r"^//src/mongo/[^:\s\[]+:[^\s\[]+")
    blocks: list[tuple[str, list[str]]] = []
    cur_target: str | None = None
    cur_buf: list[str] = []

    for line in combined.splitlines():
        if target_line_re.match(line):
            if cur_target is not None:
                blocks.append((cur_target, cur_buf))
            cur_target = line.split()[0]
            cur_buf = [line]
        elif cur_target is not None:
            # continuation line of a multi-line srcs expression
            cur_buf.append(line)
    if cur_target is not None:
        blocks.append((cur_target, cur_buf))

    failures: list[tuple[str, str]] = []
    fixes: list[tuple[str, str]] = []  # (cmd, target)
    structured_fail_found = False  # to enforce FAIL_ON_STRUCTURED

    def pkg_of(label: str) -> str:
        """Return the package part of an absolute ``//pkg:name`` label."""
        return label[2:].split(":", 1)[0]

    def normalize_token(pkg: str, tok: str) -> str | None:
        """Turn one srcs token into an absolute label, or None if it is not a plain entry."""
        t = tok.strip().strip(",")
        if not t:
            return None
        if t.startswith(("select(", "glob(")):
            return None
        if t.startswith("//"):
            return t
        if t.startswith(":"):
            return f"//{pkg}:{t[1:]}"
        # bare filename/path -> pkg-local
        if not any(ch in t for ch in " []{}:\t\n"):
            return f"//{pkg}:{t}"
        return None

    for target, buf in blocks:
        if target.endswith(SKIP_SUFFIXES) or SKIP_PKG_SUBSTR in target:
            continue

        text = "\n".join(buf)

        # quick lint: any .h* anywhere?
        if not HEADER_RE.search(text):
            continue

        # first [...] only (top-level list)
        m = re.search(r"\[(.*?)\]", text, flags=re.DOTALL)
        top_tokens: list[str] = []
        if m:
            inner = m.group(1).replace("\n", " ").strip()
            if inner:
                try:
                    top_tokens = shlex_split(inner)
                except ValueError:
                    # unbalanced quotes: fall back to whitespace splitting
                    top_tokens = inner.split()

        pkg = pkg_of(target)
        concrete_headers: list[str] = []
        kept_public = False  # saw an allow-listed header in the top-level list
        other_header_like = False  # saw header-looking text in any other token
        for tok in top_tokens:
            norm = normalize_token(pkg, tok)
            if norm in PUBLIC_KEEP:
                kept_public = True
                continue
            if HEADER_RE.search(tok):
                other_header_like = True
            if not norm:
                continue
            base = norm.rpartition(":")[2]
            if base.endswith(HEADER_EXTS):
                concrete_headers.append(norm)

        if concrete_headers:
            structured_has_hdr = False
        elif kept_public and not other_header_like:
            # The allow-listed header explains the match inside the list; only
            # header-like text elsewhere in the block still counts as structured.
            outside = text[: m.start()] + text[m.end() :]
            structured_has_hdr = bool(HEADER_RE.search(outside))
        else:
            # If there were headers somewhere but none resolved from the first
            # [...], assume they come from select()/glob().
            structured_has_hdr = True

        if not concrete_headers and not structured_has_hdr:
            continue

        canon_target = target.replace("_with_debug", "")
        parts = []
        if concrete_headers:
            parts.append(f"concrete headers: {concrete_headers}")
        if structured_has_hdr:
            parts.append("headers via select()/glob() (not auto-fixed)")
            structured_fail_found = True

        msg = f"{canon_target} has headers in srcs: " + "; ".join(parts)
        print(msg)
        failures.append((canon_target, msg))

        if fix and concrete_headers:
            for h in concrete_headers:
                fixes.append((f"add private_hdrs {h}", canon_target))
                fixes.append((f"remove srcs {h}", canon_target))

    # 3) Apply fixes (dedupe, keeping first-seen order)
    if fix:
        for cmd, tgt in dict.fromkeys(fixes):
            subprocess.run([buildozer, cmd, tgt])

    # 4) CI reports
    if generate_report:
        for tlabel, msg in failures:
            report = make_report(tlabel, msg, 1)
            try_combine_reports(report)
            put_report(report)

    # 5) Failing rules
    #   - Fail on any violation when not fixing
    #   - Also fail if non-concrete (structured) headers were seen anywhere,
    #     since those cannot be auto-fixed
    if (failures and not fix) or (structured_fail_found and FAIL_ON_STRUCTURED):
        sys.exit(1)


def main():
    """Parse ``--generate-report`` / ``--fix`` and run every validator in order."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--generate-report", default=False, action="store_true")
    parser.add_argument("--fix", default=False, action="store_true")
    args = parser.parse_args()

    validate_clang_tidy_configs(args.generate_report, args.fix)
    validate_bazel_groups(args.generate_report, args.fix)
    validate_idl_naming(args.generate_report, args.fix)
    validate_private_headers(args.generate_report, args.fix)


if __name__ == "__main__":
    main()