Files
mongo/jstests/aggregation/sources/group/distinct_scan_optimization.js
Zac 591928c619 SERVER-108478 JS formatted by prettier and remove clang-format (#39656)
GitOrigin-RevId: 6c8f6aded47f260aa4f7c231b17dae3302cb1e04
2025-08-21 17:27:09 +00:00

183 lines
7.4 KiB
JavaScript

/**
* This is a property-based test for the group distinct scan optimization. It works by generating an
* index spec and a set of documents and using that index to construct a pipeline which will perform
* a distinct scan.
*
* @tags: [
* # Aggregation with explain may return incomplete results if interrupted by a stepdown.
* does_not_support_stepdowns,
* requires_pipeline_optimization,
* # We don't want to verify that the optimization is applied inside $facet since its shape is
* # quite different from the original one.
* do_not_wrap_aggregations_in_facets,
* cannot_run_during_upgrade_downgrade,
* ]
*/
import {FeatureFlagUtil} from "jstests/libs/feature_flag_util.js";
import {isSlowBuild} from "jstests/libs/query/aggregation_pipeline_utils.js";
import {assertPlanUsesDistinctScan} from "jstests/libs/query/group_to_distinct_scan_utils.js";
import {fc} from "jstests/third_party/fast_check/fc-3.1.0.js";
// Skip the whole test on slow build variants (per the log message below: debug on, opt off, or a
// sanitizer enabled).
if (isSlowBuild(db)) {
    jsTestLog("Exiting early because debug is on, opt is off, or a sanitizer is enabled.");
    quit();
}

const coll = db.distinct_scan_optimization;
coll.drop();

// Name of the collection's single shard key field, or 'undefined' when it cannot be determined.
// Any failure while probing (getShardKey() throwing, or a shard key that does not have exactly one
// field) is deliberately mapped to 'undefined'.
// NOTE(review): presumably getShardKey() throws for an unsharded collection - confirm.
const shardKey = (() => {
    let keyFieldNames;
    try {
        keyFieldNames = Object.keys(coll.getShardKey());
        assert.eq(1, keyFieldNames.length);
    } catch (err) {
        return undefined;
    }
    return keyFieldNames[0];
})();

// A sharded collection can only be tested when featureFlagShardFilteringDistinctScan is enabled;
// otherwise bail out of the test.
if (shardKey !== undefined) {
    if (!FeatureFlagUtil.isEnabled(db, "ShardFilteringDistinctScan")) {
        jsTestLog(
            "Skipping distinct_scan_optimization.js because we encountered sharded collection and featureFlagShardFilteringDistinctScan is disabled",
        );
        quit();
    }
}
// Areas for improvement of this test:
// * Add dotted paths
// * Support $topN and $bottomN accumulators and sortBy field
// * Generate multiple valid indexes so that we stress the distinct scan multiplanning code

// Field names an index spec may be built from.
const fieldArb = fc.constantFrom("_id", "a", "b", "c", "mk");
// Index key direction: ascending (1) or descending (-1).
const directionArb = fc.constantFrom(1, -1);
// Accumulator operator applied to every non-group field in the $group stage.
const accumArb = fc.constantFrom("$first", "$last");

// Arbitrary representing an index spec that is eligible for the group distinct scan optimization:
// between 2 and 5 distinct fields, where the leading field gets one direction and every remaining
// field shares a second direction.
const indexSpecArb = fc
    .tuple(
        fc.uniqueArray(fieldArb, {minLength: 2, maxLength: 5}),
        // Direction for first field
        directionArb,
        // Direction for rest of the fields
        // TODO SERVER-95418: Remove this restriction when mixture of $first and $last is allowed
        directionArb,
    )
    .map(([fields, firstDirection, restDirection]) =>
        // Build the spec in field order so that key order matches the generated array order.
        Object.fromEntries(
            fields.map((name, position) => [name, position === 0 ? firstDirection : restDirection]),
        ),
    );
// Building blocks for scalar field values. The value domain is intentionally tiny to keep the
// minimization time fast and ensure that each bucket in a $group has multiple entries.
const smallIntArb = fc.integer({min: 1, max: 3});
const shortStringArb = fc.constantFrom("aaa", "bbb", "ccc");

// Arbitrary representing values for RHS: a small integer, a short string, or null.
const fieldValueArb = fc.oneof(smallIntArb, shortStringArb, fc.constant(null));

// Same as 'fieldValueArb' but never null. Used for the $match stage placed before $sort, because a
// comparison to null prevents the distinct scan optimization.
const nonNullFieldArb = fc.oneof(smallIntArb, shortStringArb);

// Arbitrary representing an array of integers (at most 3 elements).
const multikeyValueArb = fc.array(fc.integer(), {maxLength: 3});

// Arbitrary representing a document: three scalar fields plus a single multikey field 'mk'.
const documentArb = fc.record(
    {
        a: fieldValueArb,
        b: fieldValueArb,
        c: fieldValueArb,
        // Allow for one multikey field
        mk: multikeyValueArb,
    },
    {noNullPrototype: false},
);

// Arbitrary for all documents in the collection (30 to 50 documents).
const docsArb = fc.array(documentArb, {minLength: 30, maxLength: 50});

// Arbitrary for whether to include a match stage.
const includeMatchArb = fc.boolean();

// Arbitrary for a single test case:
// [index spec, accumulator, documents, include $match?, $match comparison value].
const testCaseArb = fc.tuple(indexSpecArb, accumArb, docsArb, includeMatchArb, nonNullFieldArb);
/**
 * Reports whether an array contains at least one repeated value.
 *
 * Values are compared the way a Set compares them, so NaN matches NaN and 0 matches -0.
 *
 * @param {Array} arr - Values to inspect (the test passes an array of integers).
 * @returns {boolean} True if the number of distinct values differs from the array length.
 */
function hasDuplicates(arr) {
    const distinctValues = new Set();
    for (const value of arr) {
        distinctValues.add(value);
    }
    return distinctValues.size !== arr.length;
}
// Property: for every generated (index, accumulator, documents, optional $match) combination that
// passes the preconditions below, the $sort + $group pipeline must be planned as a distinct scan
// and must return the same result set as the same pipeline forced onto a collection scan.
fc.assert(
    fc.property(testCaseArb, ([indexSpec, accumOp, docs, includeMatch, matchValue]) => {
        // The leading index field is the group key; every other field is accumulated.
        const [groupField, ...accumFields] = Object.keys(indexSpec);

        // The distinct scan optimization doesn't work when the group key is multikey.
        fc.pre(groupField !== "mk");

        // If we have a shard key, it must be the group key to generate a distinct scan plan.
        fc.pre(shardKey === undefined || groupField === shardKey);

        // Only the $first accumulator works with multikey fields. (Since 'mk' is never the group
        // field at this point, the index contains 'mk' exactly when it is an accumulated field.)
        //
        // Suppose we have the query:
        //   [{$sort: {a: 1, mk: 1}}, {$group: {_id: '$a', accum: {$last: '$mk'}}}]
        // This sort means we want to order documents in ascending order of the smallest value in
        // the mk array. We cannot get the last such element by using a distinct scan on the
        // {a: 1, mk: 1} index in backwards direction because the index contains keys for all
        // values of the mk array. The first one we'd encounter in a backwards index scan may not
        // correspond to the document containing the largest smallest value in an mk array.
        //
        // Suppose we have the other case:
        //   [{$sort: {a: 1, mk: -1}}, {$group: {_id: '$a', accum: {$last: '$mk'}}}]
        // This sort order means order the documents by descending values of the largest element
        // in mk. The same logic as described above applies.
        fc.pre(!accumFields.includes("mk") || accumOp === "$first");

        // Ensure that all values in the mk array across all documents are unique. This prevents a
        // situation with tied sort order, which can result in multiple correct answers.
        // For example:
        //   * Index: {a: 1, mk: 1}
        //   * Docs: {_id: 1, a: 1, mk: [1,2,3]}, {_id: 2, a: 1, mk: [1]}
        //   * Query: {$sort: {a: 1, mk: 1}}, {$group: {_id: '$a', accum: {$first: '$mk'}}}
        // Both {_id: 1, accum: [1]} and {_id: 1, accum: [1,2,3]} are valid results.
        const allMkValues = docs.flatMap((doc) => doc.mk);
        fc.pre(!hasDuplicates(allMkValues));

        // Start every run from a fresh collection holding only the generated documents.
        coll.drop();
        assert.commandWorked(coll.insert(docs));

        // One accumulator per non-group index field, named '<field>_accum'.
        const accumulators = Object.fromEntries(
            accumFields.map((field) => [`${field}_accum`, {[accumOp]: `$${field}`}]),
        );

        // Distinct scan only supports grouping over a single field.
        // TODO SERVER-96679: Remove this restriction.
        const pipeline = [{$sort: indexSpec}, {$group: {_id: `$${groupField}`, ...accumulators}}];
        if (includeMatch) {
            // Optionally filter on the group key ahead of the $sort.
            pipeline.unshift({$match: {[groupField]: {$gte: matchValue}}});
        }

        // Log the generated inputs for this run.
        jsTestLog(docs);
        jsTestLog(indexSpec);
        jsTestLog(pipeline);

        assert.commandWorked(coll.createIndex(indexSpec));
        assertPlanUsesDistinctScan(db, coll.explain().aggregate(pipeline));

        // Compare the default plan's output against the same pipeline hinted to natural order.
        const defaultPlanResults = coll.aggregate(pipeline).toArray();
        const naturalHintResults = coll.aggregate(pipeline, {hint: {$natural: 1}}).toArray();
        assert(_resultSetsEqualUnordered(defaultPlanResults, naturalHintResults));
    }),
    {seed: 5, numRuns: 500},
);