Files
mongo/modules_poc/mod_scanner.py
Mathias Stearn 641ecf1b01 SERVER-98435 Work around bug in python codeowners lib (#35032)
GitOrigin-RevId: 5f00ba628bac5f1b58ef907acee47b687585a45f
2025-04-16 10:21:48 +00:00

1002 lines
35 KiB
Python
Executable File

#!/usr/bin/env python3
import dataclasses
import functools
import itertools
import json
import os
import sys
from dataclasses import dataclass
from datetime import datetime
from functools import cache, cached_property
from glob import glob
from pathlib import Path # if you haven't already done so
from typing import NoReturn
import codeowners
import pyzstd
import regex as re
import yaml
from codeowners import CodeOwners
try:
from yaml import CDumper as Dumper
from yaml import CLoader as Loader
except ImportError:
raise RuntimeError("Why no cYaml?")
# from yaml import Loader, Dumper
file = Path(__file__).resolve()
parent, root = file.parent, file.parents[1]
sys.path.append(str(file.parent))
# os.chdir(parent.parent) # repo root (uncomment for python debugger)
import cindex as clang
from cindex import Config, Cursor, CursorKind, Index, LinkageKind, RefQualifierKind, TranslationUnit
from cindex import File as ClangFile
# Monkey patch some features into clang's python binding. Keeping commented out for now in case we decide not to use modified lib.
# clang.functionList.append(("clang_File_isEqual", [ClangFile, ClangFile], ctypes.c_int))
# clang.functionList.append(("clang_Cursor_hasAttrs", [Cursor], ctypes.c_uint))
# clang.Cursor.__hash__ = lambda self: self.hash
# clang.File.__eq__ = lambda self, other: other is not None and bool(
# clang.conf.lib.clang_File_isEqual(self, other)
# )
# def get_specialized_template(node: Cursor):
# return Cursor.from_cursor_result(clang.conf.lib.clang_getSpecializedCursorTemplate(node), node)
# def has_attrs(node: Cursor):
# return node.has_attrs()
def is_tu(c: Cursor | CursorKind):
if isinstance(c, Cursor):
c = c.kind
return c == CursorKind.TRANSLATION_UNIT
out_from_env = os.environ.get("MOD_SCANNER_OUTPUT", None)
is_local = out_from_env is None
# Copied from
# https://github.com/sbdchd/codeowners/blob/53a7a9533ab455b0aa3f35f599558a2e1a1e97b7/codeowners/__init__.py#L17-L108
# then modified to correctly handle **, fixing https://github.com/sbdchd/codeowners/issues/43.
def path_to_regex(pattern: str):
"""
ported from https://github.com/hmarr/codeowners/blob/d0452091447bd2a29ee508eebc5a79874fb5d4ff/match.go#L33
MIT License
Copyright (c) 2020 Harry Marr
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
regex = ""
slash_pos = pattern.find("/")
anchored = slash_pos > -1 and slash_pos != len(pattern) - 1
regex += r"\A" if anchored else r"(?:\A|/)"
matches_dir = pattern[-1] == "/"
matches_no_subdirs = pattern[-2:] == "/*"
pattern_trimmed = pattern.strip("/")
in_char_class = False
escaped = False
iterator = enumerate(pattern_trimmed)
for i, ch in iterator:
if escaped:
regex += re.escape(ch)
escaped = False
continue
if ch == "\\":
escaped = True
elif ch == "*":
if i + 1 < len(pattern_trimmed) and pattern_trimmed[i + 1] == "*":
left_anchored = i == 0
leading_slash = i > 0 and pattern_trimmed[i - 1] == "/"
right_anchored = i + 2 == len(pattern_trimmed)
trailing_slash = i + 2 < len(pattern_trimmed) and pattern_trimmed[i + 2] == "/"
if (left_anchored or leading_slash) and (right_anchored or trailing_slash):
# MONGO CHANGE vvvv
if trailing_slash:
# Match behavior of glob.translate() from Python 3.13+
# https://github.com/python/cpython/blob/0879ebc953fa7372a4d99f3f79889093f04cac67/Lib/glob.py#L291
regex += "(?:.*/)?"
else:
regex += ".*"
# ORIG CODE vvvv
# regex += ".*"
# MONGO CHANGE ^^^^
next(iterator, None)
next(iterator, None)
continue
regex += "[^/]*"
elif ch == "?":
regex += "[^/]"
elif ch == "[":
in_char_class = True
regex += ch
elif ch == "]":
if in_char_class:
regex += ch
in_char_class = False
else:
regex += re.escape(ch)
else:
regex += re.escape(ch)
if in_char_class:
raise ValueError(f"unterminated character class in pattern {pattern}")
if matches_dir:
regex += "/"
elif matches_no_subdirs:
regex += r"\Z"
else:
regex += r"(?:\Z|/)"
return re.compile(regex)
# Monkey-patch codeowners lib to work around https://github.com/sbdchd/codeowners/issues/43.
# This must be done prior to the first usage of the library.
codeowners.path_to_regex = path_to_regex
with open(root / ".github/CODEOWNERS") as f:
code_owners = CodeOwners(f.read())
with open(parent / "modules.yaml") as f:
def parseModules():
raw_mods = yaml.load(f, Loader=Loader)
lines = []
for mod, globs in raw_mods.items():
for glob in globs:
lines.append(f"/{glob} @10gen/{mod}")
if glob.endswith(".idl"):
lines.append(f"/{glob[:-4]}_gen.* @10gen/{mod}")
# If multiple rules match, later wins. So put rules with more
# specificity later. For all of our current rules, longer means more
# specific.
lines.sort(key=lambda l: len(l.split()[0]))
return "\n".join(lines)
modules = CodeOwners(parseModules())
def perr(*values):
print(*values, file=sys.stderr)
def perr_exit(*values) -> NoReturn:
perr(*values)
sys.exit(1)
class DecoratedCursor(Cursor):
# All USRs start with 'c:'. Local USRs then have a filename+'@' followed by
# an optional number+'@'. Global USRs just start with 'c:@'
_USR_GLOBALIZER_REGEX = re.compile(r"c:[\w\.\-]+@(\d+@)?")
# CursorKinds that represent types. For these we prefer definition locations.
# This was decided by manually examining the unique kinds from the output.
_TYPE_KINDS = {
CursorKind.ENUM_DECL,
CursorKind.STRUCT_DECL,
CursorKind.UNION_DECL,
CursorKind.CLASS_DECL,
CursorKind.CLASS_TEMPLATE,
CursorKind.CLASS_TEMPLATE_PARTIAL_SPECIALIZATION,
# Unsure about these:
# CursorKind.TYPE_ALIAS_DECL,
# CursorKind.TYPEDEF_DECL,
# CursorKind.TYPE_ALIAS_TEMPLATE_DECL,
}
def __init__(self, c: Cursor):
# Unfortunately, need to decompose and have Cursor constructor recompose.
super().__init__(c._kind_id, c.xdata, c.data)
self._tu = c._tu
@staticmethod
@cache
def normalize(c: Cursor):
assert c.kind != CursorKind.NAMESPACE # Should not be called with this.
# unresolve implicit instantiations
while templ := c.specialized_template:
# Clang unfortunate behavior: The method to "unspecialize" a template
# will both go from implicit instantiation to the template *and* go from
# an explicit specialization to the primary template. Ideally, we
# would only do the first, but that isn't an option. So we try to fake
# it by only using the result if the locations are the same. However,
# in some cases (notably including template methods of a template class),
# clang will jump from the definition to the declaration, and neither
# orig.canonical or result.get_definition() works to get the same location.
# So we compromise: fully unspecialize non-type templates (variables and
# functions), but require the locs to match on types. This is important
# because class specializations can have different members than their
# primary template and we want to handle those correctly. We ignore all
# child declarations of functions, so that isn't a problem there.
# This still chokes on explicit and extern template instantiations, but
# it isn't clear how to fix that.
if c.kind in DecoratedCursor._TYPE_KINDS:
if templ.location != c.location and templ.extent != c.extent:
templ_def = templ.get_definition()
if not templ_def or templ_def.location != c.location:
break
c = templ
usr = c.get_usr()
definition = c.get_definition()
# In clang terms, the "canonical" declaration is the first one seen in the TU.
canonical = c.canonical
assert canonical
if c.kind not in (
CursorKind.TYPEDEF_DECL,
CursorKind.TYPE_ALIAS_DECL,
): # Hit a clang bug :(
assert canonical.get_usr() == usr
if definition:
assert definition.get_usr() == usr
# For types, prefer the definition if it is in a header, otherwise use the canonical decl.
c = canonical
if c.kind in DecoratedCursor._TYPE_KINDS:
if definition and definition.location.file.name.endswith(".h"):
c = definition
return DecoratedCursor(c)
@cached_property
def raw_parent(self):
if is_tu(self.semantic_parent):
# We never want to treat TUs as parents.
return
assert self.semantic_parent
return DecoratedCursor(self.semantic_parent)
@cached_property
def normalized_parent(self):
if not self.raw_parent:
return None
if self.raw_parent.kind == CursorKind.NAMESPACE:
return self.raw_parent # Note: returning same object to share cached properties.
return DecoratedCursor.normalize(self.raw_parent)
@property
def normalized_parents(self):
p = self.normalized_parent
while p and not is_tu(p):
yield p
p = p.normalized_parent
@cached_property
def raw_usr(self):
return self.get_usr()
@cached_property
def globalized_usr(self):
"""
Removes the file and unique number clang adds to some USRs without external linkage.
This includes (among other cases) anything that has a lambda as part of its type,
and namspace-scope constant integers. This interferes with our normalizing of USRs
because it breaks the rule that everything's USR starts with its partent's USR.
Globalizing restores that property.
I have manually verified that this does not cause problematic collisions between USRs.
There were only 4 groups of declarations that ended up with the same USR after
globalizing. 3 were all function-local lambdas that get filtered out with other
function-local declarations, and the last was the decay operator for lambdas
used to build a hand-rolled VTable in a class's private section.
"""
usr = DecoratedCursor._USR_GLOBALIZER_REGEX.sub("c:@", self.raw_usr)
return usr
@cached_property
def normalized_usr(self):
"""
Like globalized_usr, but replaces the raw_parent's USR prefix with the normalized _parent's USR
"""
usr = self.globalized_usr
if not usr or self.kind == CursorKind.NAMESPACE or not self.raw_parent:
# Namespaces don't undergo any normalization, so we can break the cycle here.
return usr
assert usr.startswith(self.raw_parent.globalized_usr)
return self.normalized_parent.normalized_usr + usr[len(self.raw_parent.globalized_usr) :]
@cached_property
def definition(self):
d = self.get_definition()
if not d:
return None
if d == self:
return self # keep cache
return DecoratedCursor(self)
@property # no need to cache
def has_definition(self):
return self.definition is not None
DETAIL_REGEX = re.compile(r"(detail|internal)s?$")
def get_visibility(c: DecoratedCursor, scanning_parent=False):
if c.has_attrs():
for child in c.get_children():
if child.kind != CursorKind.ANNOTATE_ATTR:
continue
terms = child.spelling.split("::")
if not (len(terms) >= 3 and terms.pop(0) == "mongo" and terms.pop(0) == "mod"):
continue
if terms[0] == "shallow":
terms.pop(0)
assert terms
if scanning_parent:
continue # shallow doesn't apply to children
attr = terms.pop(0)
if terms:
alt = "::".join(terms)
assert attr in ("use_replacement",)
else:
alt = None
assert attr in (
"public",
"private",
"file_private",
"needs_replacement",
)
return (attr, alt)
# Some rules for implicitly private decls
# TODO: Unfortunately these rules are violated on 64 declarations,
# so it can't be enabled yet.
#
# - Some of the forTest methods appear to be intended as helpers for
# consumers writing tests. We may want to use a different suffix like
# "forTests" for that.
# - The usages of details namespace violations are more tricky, and there
# appear to be a few kinds:
# - True violations: we should fix these.
# - Files not mapped to modules correctly: we should fix the mapping.
# - APIs intended to be used from macro implementations: We might be
# able to fix these by using clang_getFileLocation rather than
# clang_getInstantiationLocation, but I don't think we want to do
# that everywhere and it isn't currently exposed from python.
# For now we may just want to mark those as public.
# - Types not intended to be named directly by consumers, but used as
# part of public APIs (eg return types or base classes) such that
# consumers are expected to use their APIs. Maybe they should be
# declared public anyway?
if 0: # :(
if c.spelling.endswith("forTest"):
return "private"
# details and internal namespaces
if c.kind == CursorKind.NAMESPACE and DETAIL_REGEX.match(c.spelling):
return "private"
if not c.normalized_parent:
return ("UNKNOWN", None) # break recursion
return get_visibility(c.normalized_parent, scanning_parent=True)
def normpath_for_file(f: ClangFile | str | None) -> str | None:
if f is None:
return None
name = f.name if type(f) == ClangFile else f
if "/third_party/" in name:
return None
offset = name.find("src/mongo")
if offset == -1:
return None
name = name[offset:]
return os.path.normpath(name) # fix up a/X/../b/c.h -> a/b/c.h
file_mod_map: dict[str | None, str | None] = {None: None}
def mod_for_file(f: ClangFile | str | None) -> str | None:
name = normpath_for_file(f)
if name in file_mod_map:
return file_mod_map[name]
match modules.of(name):
case []:
mod = "__NONE__"
case [[kind, mod]]:
assert kind == "TEAM"
ignore = "@10gen/"
assert mod.startswith(ignore)
mod = mod[len(ignore) :]
case owners:
perr_exit(
f"ERROR: multiple owners for file {name}: {', '.join(mod for (_, mod) in owners)}"
)
file_mod_map[name] = mod
return mod
def teams_for_file(f: ClangFile | str | None):
name = normpath_for_file(f)
if name is None:
return []
# No need to cache since this is called once per file
teams = []
for kind, owner in code_owners.of(name):
if kind != "TEAM": # ignore both individual engineers and svc-auto-approve-bot
continue
ignore = "@10gen/"
assert owner.startswith(ignore)
owner = owner[len(ignore) :]
owner = owner.replace("-", "_") # easier for processing with jq
teams.append(owner)
return teams if teams else ["__NO_OWNER__"]
@dataclass
class Decl:
display_name: str
usr: str
raw_usr: str
# mangled_name: str
loc: str
kind: str
mod: str | None
linkage: str
defined: bool
spelling: str
visibility: str
alt: str
sem_par: str
lex_par: str
used_from: dict[str, set[str]] = dataclasses.field(default_factory=dict, compare=False)
def def_or_decled(self) -> str:
return "defined" if self.defined else "declared"
@staticmethod
def from_cursor(c: Cursor, mod=None):
if not isinstance(c, DecoratedCursor):
c = DecoratedCursor(c)
vis, alt = get_visibility(c)
return Decl(
display_name=fully_qualified(c),
spelling=c.spelling,
usr=c.normalized_usr,
raw_usr=c.raw_usr,
# mangled_name=c.mangled_name,
loc=pretty_location(c.location),
linkage=c.linkage.name,
kind=c.kind.name,
mod=mod or mod_for_file(c.location.file),
defined=c.has_definition,
visibility=vis,
alt=alt,
sem_par=c.normalized_parent.normalized_usr if c.normalized_parent else None,
lex_par=(
DecoratedCursor(c.lexical_parent).normalized_usr
if not is_tu(c.lexical_parent)
else None
),
)
def pretty_location(loc: clang.SourceLocation | clang.Cursor):
if isinstance(loc, Cursor):
if loc.location.file:
loc = loc.location
else:
# Clang bug: For some reason, usages of conversion operators lack a
# location, but have an extent. Use the start of the extent instead.
extent_start = loc.extent.start # type: clang.SourceLocation
loc = extent_start
name = os.path.normpath(loc.file.name) if loc.file else "<unknown>"
# return f"{name}({loc.line},{loc.column})" # MSVC format
return f"{name}:{loc.line}:{loc.column}" # gcc format
decls = dict[str, Decl]()
def fully_qualified(c: DecoratedCursor):
parts = []
for c in itertools.chain((c,), c.normalized_parents):
spelling = c.displayname
if spelling:
if c.is_const_method():
spelling += " const"
match c.type.get_ref_qualifier():
case RefQualifierKind.LVALUE:
spelling += " &"
case RefQualifierKind.RVALUE:
spelling += " &&"
parts.append(spelling)
if not parts:
return ""
if parts[-1] == "mongo":
parts.pop()
else:
parts.append("")
parts.reverse()
return "::".join(parts)
def add_decl(d: Decl):
if d.usr not in decls:
decls[d.usr] = d
return
old = decls[d.usr]
if old.mod != d.mod:
perr(
f"{d.loc}:warning: {d.kind} {d.display_name} {d.def_or_decled()} in module {d.mod} "
+ f"after previously being {old.def_or_decled()} in module {old.mod}"
)
perr(f"{old.loc}:note: prior definition here")
if d.defined and old.defined:
# print(d.kind)
# print(d.kind == CursorKind.TYPEDEF_DECL)
# if d.kind == CursorKind.TYPEDEF_DECL:
# return # TODO: how to handle this?
if d == old:
return # it doesn't matter, ignore it
if not any(
special_case in d.display_name
for special_case in ("(unnamed ", "UFDeductionHelper", "<IsConst, IndexScanStats>")
) and not d.spelling.startswith("(anonymous "):
return # ignore
print("detected duplicate definitions!")
print(d.loc, d)
print(old.loc, old)
assert not (d.defined and old.defined)
if d.defined and not old.defined:
assert not d.used_from
d.used_from = old.used_from
decls[d.usr] = d
# TODO consider merging otherwise?
# These are completely skipped during decl finding
skip_kinds = {
# parameters
CursorKind.PARM_DECL,
CursorKind.TEMPLATE_TYPE_PARAMETER,
CursorKind.TEMPLATE_NON_TYPE_PARAMETER,
CursorKind.TEMPLATE_TEMPLATE_PARAMETER,
# Function bodies
CursorKind.COMPOUND_STMT,
CursorKind.CXX_TRY_STMT,
# Useless
CursorKind.CXX_ACCESS_SPEC_DECL, # doesn't have children
CursorKind.STATIC_ASSERT,
#
# TODO Consider for future for things like hidden friends
CursorKind.FRIEND_DECL,
}
skip_mods: tuple[str, ...] = ()
def find_decls(mod: str, c: Cursor):
if c.location.file:
assert mod_for_file(c.location.file) == mod # maybe
if c.kind.is_declaration() and c.kind != CursorKind.NAMESPACE and c.spelling:
add_decl(Decl.from_cursor(c))
if c.kind == CursorKind.TYPE_ALIAS_TEMPLATE_DECL:
return
for child in c.get_children():
if child.kind in skip_kinds:
continue
if child.kind.is_attribute():
continue
find_decls(mod, child)
function_kinds = {
CursorKind.CONSTRUCTOR,
CursorKind.CONVERSION_FUNCTION,
CursorKind.CXX_METHOD,
CursorKind.DESTRUCTOR,
CursorKind.FUNCTION_DECL,
CursorKind.FUNCTION_TEMPLATE,
}
def is_local_decl(c: Cursor):
assert c.kind.is_declaration
# Checking linkage first avoids doing expensive check for things we know can't be local.
if c.linkage not in (LinkageKind.NO_LINKAGE, LinkageKind.INTERNAL):
return False
# Important: this skips over the input c itself, since we don't want to consider
# functions as local decls, unless they are inside of another function.
while (c := c.semantic_parent) and not is_tu(c):
if c.kind in function_kinds:
return True
return False
def find_usages(mod: str, c: Cursor):
ref = c.referenced
# Handle children first. This makes it possible to use early returns below
for child in c.get_children():
# Don't count friendship as a "usage". This causes problems since the friend decl
# becomes the canonical decl for the type for any TU that doesn't see the definition.
# "Hidden friend" definitions *are* traversed.
if c.kind == CursorKind.FRIEND_DECL and not child.is_definition():
return
assert child != c
assert ref is None or child != ref or ref.kind == CursorKind.OVERLOADED_DECL_REF
find_usages(mod, child)
if ref is None or ref == c:
return
if ref.kind in (
CursorKind.NAMESPACE,
CursorKind.NAMESPACE_ALIAS,
CursorKind.TEMPLATE_TEMPLATE_PARAMETER,
CursorKind.TEMPLATE_TYPE_PARAMETER,
CursorKind.TEMPLATE_NON_TYPE_PARAMETER,
CursorKind.PARM_DECL,
CursorKind.NO_DECL_FOUND,
):
return
if ref.kind == CursorKind.OVERLOADED_DECL_REF:
# These come up when parsing a dependently-typed call. Unfortunately they
# are not very useful, so they are one of many cases where we can't get
# good info out of templates.
assert not ref.get_usr()
return
# NOTE: This is for templated variables and their specializations. Ideally these
# would be tracked, but we only have 27 template variables (4 of which are used
# cross-module) and they are generating thousands of unique declarations because
# libclang doesn't expose enough info for us to merge them well. This massively
# skews the results because they are 10% of all decls!
# TODO: we should at least check that private decls aren't used from the wrong mod
# before returning.
if ref.kind == CursorKind.UNEXPOSED_DECL:
return
if not ref.canonical.location.file:
# These are pre-declared in the compiler with no source location. In some cases,
# they are redeclared in the stdlib, but canonicalization points them back
# at the internal declaration. Make sure that this isn't causing us to skip
# any first-party declarations.
assert not ref.location.file or mod_for_file(ref.location.file) is None
return
if is_local_decl(ref):
return
# Unfortuntely libclang's c api doesn't handle implicitly declared methods
# well. In particular it often points at a location of a forward decl of the
# class rather than the definition, even if both are visible. And then the
# rest of our handling doesn't work correctly. And it also doesn't have a
# way to distinguish implicit methods from explicitly defaulted ones. So we
# just resolve all defaulted methods to the type and continue from there.
if ref.is_default_method():
ref = ref.semantic_parent
# assert not c.location.file or mod_for_file(c.location.file) == mod
ref = DecoratedCursor.normalize(ref)
# Ignore any declarations not declared in a header.
# TODO what if a local type is passed to a template? For now doesn't matter because we
# don't look at usages from instantiations.
if ref.location.file.name.endswith(".cpp"):
return
usr = ref.normalized_usr
if not usr:
return
if usr in decls:
# We've already done the work to get the info for this decl.
d = decls[usr]
else:
decl_mod = mod_for_file(ref.location.file)
if not decl_mod or decl_mod in skip_mods:
return
d = Decl.from_cursor(ref, decl_mod)
decls[usr] = d
if ref.definition and ref != ref.definition:
def_mod = mod_for_file(ref.definition.location.file)
# Note def_mod is None means third_party, not __NONE__ module
if def_mod != decl_mod and def_mod is not None:
print(f"WARNING: {d.display_name} is declared and defined in different modules")
print(f" decl: {pretty_location(ref)} ({decl_mod})")
print(f" defn: {pretty_location(ref.definition)} ({def_mod})")
# ignore usages from the same module
# if d.mod == mod or mod.startswith(d.mod):
# return
d.used_from.setdefault(mod, set()).add(pretty_location(c))
seen = set[Cursor]()
def ast(node: Cursor):
templ = node.specialized_template
usr = node.get_usr()
if node in seen:
return {
"b_kind": node.kind.name,
"c_usr": usr,
"d_display": node.displayname,
"e_location": pretty_location(node.location),
}
seen.add(node)
if 0: # toggle filtering
children = [ast(c) for c in node.get_children()]
else:
children = []
for c in node.get_children():
if c.location.file is None:
children.append(ast(c))
continue
if "src/mongo/" not in c.location.file.name:
continue
if c.kind == CursorKind.COMPOUND_STMT:
continue
children.append(ast(c))
return {
"b_kind": node.kind.name,
"c_par_usr": str(node.semantic_parent.get_usr() if node.semantic_parent else None),
"c_usr": str(usr),
"d_display": str(node.displayname),
"d_spelling": str(node.spelling),
"e_location": pretty_location(node.location),
"ee_mod": mod_for_file(node.location.file),
# "f_extent.start": str(node.extent.start),
# "g_extent.end": str(node.extent.end),
"h_is_definition": node.is_definition(),
"h_is_decl": node.kind.is_declaration(),
"h_linkage": node.linkage.name,
"z_ref": (ast(node.referenced) if node.referenced and node.referenced != node else None),
"z_templ": ast(templ) if templ else None,
"zz_children": children,
}
class Timer:
def __init__(self):
self.start = datetime.now()
def mark(self, label: str):
if is_local:
elapsed = datetime.now() - self.start
print(f"{label}: {elapsed}")
timer = Timer()
# TODO: this should probably be pulled out to a separate program, with all functions
# only called by it moved out as well. That requires pulling mod_for_file() out to a lib.
# It is only part of mod_scanner because it needs that function.
def dump_modules() -> None:
out: dict[str, dict[str, dict[str, list[str]]]] = {}
for path in glob("src/mongo/**/*", recursive=True):
if "/third_party/" in path:
continue
extensions = ("h", "cpp", "idl", "c", "defs", "inl", "hpp")
if not any(path.endswith(f".{ext}") for ext in extensions):
continue
mod = mod_for_file(path)
assert mod # None would mean not first-party, but that is already filtered out.
(dir, leaf) = path.rsplit("/", 1)
for team in teams_for_file(path):
# In cases where multiple teams own a file, this will list the file multiple times.
# This is intended to play nicely with teams trying to filter to just the files they own.
out.setdefault(mod, {}).setdefault(team, {}).setdefault(dir, []).append(leaf)
for teams in out.values():
for dirs in teams.values():
for files in dirs.values():
files.sort()
yaml.dump(out, open("modules.yaml", "w"))
def parseTU(args: list[str] | str):
if not Config.loaded:
Config.set_compatibility_check(False)
external = "external" if os.path.exists("external") else "bazel-out/../../../external"
paths_to_try = [
f"{external}/mongo_toolchain_v5/v5/lib/libclang.so",
f"{external}/mongo_toolchain_v4/v4/lib/libclang.so",
f"{external}/mongo_toolchain/v4/lib/libclang.so",
]
for path in paths_to_try:
if os.path.exists(path):
Config.set_library_file(path)
break
else:
path_lines = "\n\t".join(paths_to_try) # can't have \ in f-string expr
perr_exit(f"Unable to find libclang.so. Paths tried:\n\t{path_lines}")
# Config.set_library_file("/home/ubuntu/clang+llvm-19.1.1-aarch64-linux-gnu/lib/libclang.so")
if type(args) == str:
args = [args]
if len(args) == 1:
compdb = clang.CompilationDatabase.fromDirectory(".")
commands = compdb.getCompileCommands(args[0])
if commands is None:
perr_exit(f"no compile commands for {args[0]}")
if len(commands) != 1:
perr_exit(f"too many compile commands for {args[0]}", commands)
# print(" ".join(commands[0].arguments))
args = list(commands[0].arguments)[1:] # skip executable
# somehow clang implicitly adds args that it warns about
cleanArgs = ["-Wno-unused-command-line-argument"]
for arg in args:
if arg in ("-MD", "-MMD", "-MF"):
continue
if arg.endswith(".d"):
continue
cleanArgs.append(arg)
# print(arg)
# Disable all warnings. Don't waste time on them when parsing.
cleanArgs.append("-w")
index = Index.create()
timer.mark("preparse")
tu = index.parse(None, cleanArgs)
if not tu:
raise RuntimeError("unable to load input")
for d in tu.diagnostics:
perr(d)
timer.mark("parsed")
return tu
def dump_unused_inputs(outPath: str, tu: TranslationUnit):
# only looking in src/mongo to cut down on resources, and to reduce the risk of accidentally
# including some file we shouldn't. Assumption is that third_party and generated sources won't
# change in a tight feedback loop.
universe = set(glob("src/mongo/**/*.h", recursive=True))
timer.mark("globbed")
for include in tu.get_includes():
if include.source:
universe.discard(include.source.name)
with open(outPath, "w") as file:
file.write("\n".join(sorted(universe)))
timer.mark("outfile written")
def main():
args = sys.argv[1:] or ["src/mongo/platform/waitable_atomic_test.cpp"]
if len(args) == 0:
perr_exit("invalid number of arguments")
if args == ["--dump-modules"]:
dump_modules()
sys.exit()
tu = parseTU(args)
if unused_input_path := os.environ.get("MOD_SCANNER_UNUSED", None):
dump_unused_inputs(unused_input_path, tu)
assert is_tu(tu.cursor)
if "DUMP_AST" in os.environ and is_local: # useful for debugging (never on bazel)
out = ast(tu.cursor)
timer.mark("ast processed")
with open("ast.yaml", "w") as f:
yaml.dump(out, f, Dumper=Dumper)
timer.mark("ast dumped")
# for top_level in tu.cursor.get_children():
# if "src/mongo/" not in top_level.location.file.name:
# continue
# find_decls(mod_for_file(top_level.location.file), top_level)
# timer.mark("found decls")
for top_level in tu.cursor.get_children():
if "src/mongo/" not in top_level.location.file.name:
continue
find_usages(mod_for_file(top_level.location.file), top_level)
timer.mark("found usages")
out_file_name = out_from_env if out_from_env else "decls.yaml"
if out_file_name.endswith(".zst"):
uncompressed_file_name = out_file_name[: -len(".zst")]
open_func = functools.partial(pyzstd.ZstdFile, write_size=2 * 1024 * 1024)
else:
uncompressed_file_name = out_file_name
open_func = open
with open_func(out_file_name, "w") as f:
out = [dict(d.__dict__) for d in decls.values() if d.mod not in skip_mods]
for decl in out:
# del decl["spelling"]
del decl["linkage"]
del decl["raw_usr"] # Can be helpful when debugging but not worth aggregating.
# del decl["defined"]
decl["used_from"] = {k: sorted(v) for k, v in decl["used_from"].items()}
# This makes us only output decls used cross-module. It makes merging much faster,
# but, it means that we can mask some cross-module usages if something is forward
# declared in the wrong module. Also this hides definitions from the
# merger so it can't choose canonical versions. There is still the problem of
# definitions not used from any TU where they are defined.
if 0:
for decl in out:
if decl["mod"] in decl["used_from"]:
del decl["used_from"][decl["mod"]]
out = list(filter(lambda d: d["used_from"], out))
timer.mark("processed")
if uncompressed_file_name.endswith(".json"):
f.write(json.dumps(out).encode())
else:
assert out_file_name.endswith(".yaml")
yaml.dump(out, f, Dumper=Dumper)
timer.mark("dumped")
if __name__ == "__main__":
main()