Files
mongo/buildscripts/cost_model/ce_generate_data.py
Timour Katchaounov 3ade0b9fe3 SERVER-72236 Generate random integer data for CE
Generate random data with integers. The approach is as follows:
- There is one collection for each different cardinality. All collections contain the same fields.
- Each field contains the data generated from a certain data distribution. The data could be anything - same type, mixed types, same mathematical distribution (e.g. normal), or a mixed distribution.
- The committed configuration file, and the corresponding data file are reduced to only two small collections. For actual experiments one needs to add more data sizes, and re-generate the data locally. This is done so that Evergreen tests can run fast, and to reduce the size of the git repository.
- All data is saved in a single JavaScript file: jstests/query_golden/libs/data/ce_accuracy_test.data, with a corresponding schema file jstests/query_golden/libs/data/ce_accuracy_test.schema.
- The data file is a JavaScript file that can be loaded directly inside a JS test. Loading this file creates a global variable dataSet. This approach is used because it is the only way to load external JSON data that does not require installing external tools in Evergreen.
2023-01-10 12:51:54 +00:00

152 lines
6.5 KiB
Python

# Copyright (C) 2022-present MongoDB, Inc.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the Server Side Public License, version 1,
# as published by MongoDB, Inc.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Server Side Public License for more details.
#
# You should have received a copy of the Server Side Public License
# along with this program. If not, see
# <http://www.mongodb.com/licensing/server-side-public-license>.
#
# As a special exception, the copyright holders give permission to link the
# code of portions of this program with the OpenSSL library under certain
# conditions as described in each individual source file and distribute
# linked combinations including the program with the OpenSSL library. You
# must comply with the Server Side Public License in all respects for
# all of the code used other than as permitted herein. If you modify file(s)
# with this exception, you may extend this exception to your version of the
# file(s), but you are not obligated to do so. If you do not wish to do so,
# delete this exception statement from your version. If you delete this
# exception statement from all source files in the program, then also delete
# it in the license file.
#
"""Data generation entry point."""
import asyncio
import dataclasses
import json
import os
import subprocess
from pathlib import Path
from bson.json_util import dumps
from config import CollectionTemplate, FieldTemplate, DataType
from data_generator import CollectionInfo, DataGenerator
from database_instance import DatabaseInstance
import parameters_extractor
#from ce_generate_data_settings import database_config, data_generator_config
from ce_data_settings import database_config, data_generator_config
__all__ = []
class CollectionTemplateEncoder(json.JSONEncoder):
    """JSON encoder that knows how to serialize the collection template objects."""

    def default(self, o):
        """Return a JSON-serializable stand-in for the template object `o`.

        - CollectionTemplate: a list with one dict per cardinality; each
          collection is named '<template name>_<cardinality>'.
        - FieldTemplate: a dict with the field name, data type and indexed flag.
        - DataType: its lower-cased name.

        Any other object is passed to the base class, which raises TypeError.
        """
        if isinstance(o, CollectionTemplate):
            return [{
                'collectionName': f'{o.name}_{card}',
                'fields': o.fields,
                'compound_indexes': o.compound_indexes,
                'cardinality': card,
            } for card in o.cardinalities]

        if isinstance(o, FieldTemplate):
            return {'fieldName': o.name, 'data_type': o.data_type, 'indexed': o.indexed}

        if isinstance(o, DataType):
            return o.name.lower()

        # Unknown type: defer to the base implementation so it raises TypeError.
        return super().default(o)
class OidEncoder(json.JSONEncoder):
    """JSON encoder that replaces non-serializable values with consecutive integers.

    It is meant for ObjectIds: the real OID of a document is dropped and a
    running integer is emitted instead, as needed by the query generator.
    """

    # Last integer handed out; shared by all encoder instances.
    cur_oid = -1

    def default(self, o):
        """Return the next consecutive integer in place of `o`."""
        # NOTE(review): every Python object has '__str__', so this guard never
        # triggers and *any* value the json module cannot serialize natively is
        # replaced by an integer, not only ObjectIds. Checking the ObjectId type
        # explicitly would be more precise.
        if not hasattr(o, '__str__'):
            return super().default(o)

        next_oid = OidEncoder.cur_oid + 1
        OidEncoder.cur_oid = next_oid
        return next_oid
async def dump_collection_to_json(db, dump_path, database_name, collections):
    """Write the given collections into a single JavaScript data file.

    The file '<dump_path>/<database_name>.data' declares one constant,
    `dataSet`, an array with one `{collName, collData}` entry per collection.
    Each document is serialized with OidEncoder, one document per line.

    Args:
        db: database object; `db[name].find({})` must be asynchronously iterable.
        dump_path: directory in which the data file is created.
        database_name: base name of the generated file.
        collections: names of the collections to dump, in output order.
    """
    with open(Path(dump_path) / f'{database_name}.data', "w") as data_file:
        data_file.write('// This is a generated file.\n')
        data_file.write('const dataSet = [\n')
        for coll_pos, coll_name in enumerate(collections):
            # Separators are written *before* every element but the first, so no
            # trailing comma follows the last collection.
            if coll_pos > 0:
                data_file.write(",")
            data_file.write(f'{{collName: "{coll_name}", collData: [\n')

            # Same separator-first scheme for documents; this avoids a separate
            # count query whose result could disagree with the cursor contents.
            wrote_doc = False
            async for doc in db[coll_name].find({}):
                if wrote_doc:
                    data_file.write(',\n')
                data_file.write(json.dumps(doc, cls=OidEncoder))
                wrote_doc = True
            if wrote_doc:
                data_file.write("\n")
            data_file.write(']}')
        data_file.write("]\n")
async def main():
    """Generate the data set and export it together with its schema.

    Steps:
      1. Open a DatabaseInstance configured by `database_config`.
      2. Drop all existing collections and populate new ones with DataGenerator.
      3. Dump every collection into '<dump_path>/<database_name>.data'.
      4. Write the collection metadata into '<dump_path>/<database_name>.schema'.
    """
    script_directory = os.path.abspath(os.path.dirname(__file__))
    os.chdir(script_directory)

    # 1. Database Instance provides connectivity to a MongoDB instance, it loads data optionally
    # from the dump on creating and stores data optionally to the dump on closing.
    with DatabaseInstance(database_config) as database_instance:
        database = database_instance.database

        # 2. Generate random data and populate collections with it.
        old_db_collections = await database.list_collection_names()
        for coll_name in old_db_collections:
            # The drop must be awaited: otherwise it is not guaranteed to have
            # completed before the collections are re-populated below.
            await database[coll_name].drop()
        generator = DataGenerator(database_instance, data_generator_config)
        await generator.populate_collections()

        # 3. Export all collections in the database into a single JavaScript data file
        # (used instead of per-collection mongoexport output).
        db_collections = await database.list_collection_names()
        await dump_collection_to_json(database, database_config.dump_path,
                                      database_config.database_name, db_collections)

        # 4. Export the collection templates used to create the test collections into JSON file.
        # One metadata entry is produced per (template, cardinality) pair.
        collections = [
            dict(collectionName=f'{coll_template.name}_{card}', fields=coll_template.fields,
                 compound_indexes=coll_template.compound_indexes, cardinality=card)
            for coll_template in data_generator_config.collection_templates
            for card in coll_template.cardinalities
        ]
        # Serialize before opening the file so a serialization error does not
        # leave a truncated schema file behind.
        json_metadata = json.dumps(collections, indent=4, cls=CollectionTemplateEncoder)
        with open(Path(database_config.dump_path) / f'{database_config.database_name}.schema',
                  "w") as metadata_file:
            metadata_file.write("// This is a generated file.\nconst dbMetadata = ")
            metadata_file.write(json_metadata)
            metadata_file.write(";")
    print("DONE!")
if __name__ == '__main__':
    # asyncio.run() creates, runs and closes its own event loop, so creating and
    # installing another loop beforehand would only leak an unused, unclosed loop.
    asyncio.run(main())