* Extend the data generation Python framework for cost calibration to support data generation for CE testing as follows: - the entry point is ce_generate_data.py, - the configuration of the generated data is in ce_generate_data_settings.py, - all collection data is exported into a single JSON file stored in 'jstests/query_golden/libs/data', and a schema file stored in the same directory * Implement a JS data loader function that also creates all indexes specified in the schema file. * Add a small JS test that shows how to load the generated JSON files into collections.
211 lines
9.5 KiB
Python
211 lines
9.5 KiB
Python
# Copyright (C) 2022-present MongoDB, Inc.
|
|
#
|
|
# This program is free software: you can redistribute it and/or modify
|
|
# it under the terms of the Server Side Public License, version 1,
|
|
# as published by MongoDB, Inc.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# Server Side Public License for more details.
|
|
#
|
|
# You should have received a copy of the Server Side Public License
|
|
# along with this program. If not, see
|
|
# <http://www.mongodb.com/licensing/server-side-public-license>.
|
|
#
|
|
# As a special exception, the copyright holders give permission to link the
|
|
# code of portions of this program with the OpenSSL library under certain
|
|
# conditions as described in each individual source file and distribute
|
|
# linked combinations including the program with the OpenSSL library. You
|
|
# must comply with the Server Side Public License in all respects for
|
|
# all of the code used other than as permitted herein. If you modify file(s)
|
|
# with this exception, you may extend this exception to your version of the
|
|
# file(s), but you are not obligated to do so. If you do not wish to do so,
|
|
# delete this exception statement from your version. If you delete this
|
|
# exception statement from all source files in the program, then also delete
|
|
# it in the license file.
|
|
#
|
|
"""Configuration of data generation for CE accuracy testing."""
|
|
|
|
from pathlib import Path
|
|
import random
|
|
import config
|
|
from random_generator import RangeGenerator, DataType, RandomDistribution, ArrayRandomDistribution
|
|
|
|
__all__ = ['database_config', 'data_generator_config']
|
|
|
|
# A string value to fill up collections and not used in queries.
|
|
HIDDEN_STRING_VALUE = '__hidden_string_value'
|
|
|
|
# Data distributions settings.
|
|
distributions = {}
|
|
|
|
string_choice_values = [
|
|
'h',
|
|
'hi',
|
|
'hi!',
|
|
'hola',
|
|
'hello',
|
|
'square',
|
|
'squared',
|
|
'gaussian',
|
|
'chisquare',
|
|
'chisquared',
|
|
'hello world',
|
|
'distribution',
|
|
]
|
|
|
|
string_choice_weights = [10, 20, 5, 17, 30, 7, 9, 15, 40, 2, 12, 1]
|
|
|
|
distributions['string_choice'] = RandomDistribution.choice(string_choice_values,
|
|
string_choice_weights)
|
|
|
|
small_query_weights = [i for i in range(10, 201, 10)]
|
|
small_query_cardinality = sum(small_query_weights)
|
|
|
|
int_choice_values = [i for i in range(1, 1000, 50)]
|
|
random.shuffle(int_choice_values)
|
|
distributions['int_choice'] = RandomDistribution.choice(int_choice_values, small_query_weights)
|
|
|
|
distributions['random_string'] = ArrayRandomDistribution(
|
|
RandomDistribution.uniform(RangeGenerator(DataType.INTEGER, 5, 10, 2)),
|
|
RandomDistribution.uniform(RangeGenerator(DataType.STRING, "a", "z")))
|
|
|
|
|
|
def generate_random_str(num: int):
|
|
strs = distributions['random_string'].generate(num)
|
|
str_list = []
|
|
for char_array in strs:
|
|
str_res = "".join(char_array)
|
|
str_list.append(str_res)
|
|
|
|
return str_list
|
|
|
|
|
|
def random_strings_distr(size: int, count: int):
|
|
distr = ArrayRandomDistribution(
|
|
RandomDistribution.uniform([size]),
|
|
RandomDistribution.uniform(RangeGenerator(DataType.STRING, "a", "z")))
|
|
|
|
return RandomDistribution.uniform([''.join(s) for s in distr.generate(count)])
|
|
|
|
|
|
small_string_choice = generate_random_str(20)
|
|
|
|
distributions['string_choice_small'] = RandomDistribution.choice(small_string_choice,
|
|
small_query_weights)
|
|
|
|
string_range_4 = RandomDistribution.normal(RangeGenerator(DataType.STRING, "abca", "abc_"))
|
|
string_range_5 = RandomDistribution.normal(RangeGenerator(DataType.STRING, "abcda", "abcd_"))
|
|
string_range_7 = RandomDistribution.normal(RangeGenerator(DataType.STRING, "hello_a", "hello__"))
|
|
string_range_12 = RandomDistribution.normal(
|
|
RangeGenerator(DataType.STRING, "helloworldaa", "helloworldd_"))
|
|
|
|
distributions['string_mixed'] = RandomDistribution.mixed(
|
|
[string_range_4, string_range_5, string_range_7, string_range_12], [0.1, 0.15, 0.25, 0.5])
|
|
|
|
distributions['string_uniform'] = RandomDistribution.uniform(
|
|
RangeGenerator(DataType.STRING, "helloworldaa", "helloworldd_"))
|
|
|
|
distributions['int_normal'] = RandomDistribution.normal(
|
|
RangeGenerator(DataType.INTEGER, 0, 1000, 2))
|
|
|
|
lengths_distr = RandomDistribution.uniform(RangeGenerator(DataType.INTEGER, 1, 10))
|
|
distributions['array_small'] = ArrayRandomDistribution(lengths_distr, distributions['int_normal'])
|
|
|
|
# Database settings
|
|
database_config = config.DatabaseConfig(
|
|
connection_string='mongodb://localhost', database_name='ce_accuracy_test', dump_path=Path(
|
|
'..', '..', 'jstests', 'query_golden', 'libs', 'data'),
|
|
restore_from_dump=config.RestoreMode.NEVER, dump_on_exit=False)
|
|
|
|
|
|
# Collection template settings
|
|
def create_index_scan_collection_template(name: str, cardinality: int) -> config.CollectionTemplate:
|
|
values = [
|
|
'iqtbr5b5is', 'vt5s3tf8o6', 'b0rgm58qsn', '9m59if353m', 'biw2l9ok17', 'b9ct0ue14d',
|
|
'oxj0vxjsti', 'f3k8w9vb49', 'ec7v82k6nk', 'f49ufwaqx7'
|
|
]
|
|
|
|
start_weight = 10
|
|
step_weight = 25
|
|
finish_weight = start_weight + len(values) * step_weight
|
|
weights = list(range(start_weight, finish_weight, step_weight))
|
|
fill_up_weight = cardinality - sum(weights)
|
|
if fill_up_weight > 0:
|
|
values.append(HIDDEN_STRING_VALUE)
|
|
weights.append(fill_up_weight)
|
|
|
|
distr = RandomDistribution.choice(values, weights)
|
|
|
|
return config.CollectionTemplate(
|
|
name=name, fields=[
|
|
config.FieldTemplate(name="choice", data_type=config.DataType.STRING,
|
|
distribution=distr, indexed=True),
|
|
config.FieldTemplate(name="mixed1", data_type=config.DataType.STRING,
|
|
distribution=distributions["string_mixed"], indexed=False),
|
|
config.FieldTemplate(name="uniform1", data_type=config.DataType.STRING,
|
|
distribution=distributions["string_uniform"], indexed=False),
|
|
config.FieldTemplate(name="choice2", data_type=config.DataType.STRING,
|
|
distribution=distributions["string_choice"], indexed=False),
|
|
config.FieldTemplate(name="mixed2", data_type=config.DataType.STRING,
|
|
distribution=distributions["string_mixed"], indexed=False),
|
|
], compound_indexes=[], cardinalities=[cardinality])
|
|
|
|
|
|
def create_physical_scan_collection_template(name: str,
|
|
payload_size: int = 0) -> config.CollectionTemplate:
|
|
template = config.CollectionTemplate(
|
|
name=name, fields=[
|
|
config.FieldTemplate(name="choice1", data_type=config.DataType.STRING,
|
|
distribution=distributions["string_choice"], indexed=False),
|
|
config.FieldTemplate(name="mixed1", data_type=config.DataType.STRING,
|
|
distribution=distributions["string_mixed"], indexed=False),
|
|
config.FieldTemplate(name="uniform1", data_type=config.DataType.STRING,
|
|
distribution=distributions["string_uniform"], indexed=False),
|
|
config.FieldTemplate(name="choice", data_type=config.DataType.STRING,
|
|
distribution=distributions["string_choice"], indexed=False),
|
|
config.FieldTemplate(name="mixed2", data_type=config.DataType.STRING,
|
|
distribution=distributions["string_mixed"], indexed=False),
|
|
], compound_indexes=[], cardinalities=[1000, 5000])
|
|
|
|
if payload_size > 0:
|
|
payload_distr = random_strings_distr(payload_size, 100)
|
|
template.fields.append(
|
|
config.FieldTemplate(name="payload", data_type=config.DataType.STRING,
|
|
distribution=payload_distr, indexed=False))
|
|
return template
|
|
|
|
|
|
collection_cardinalities = [100] #list(range(100, 301, 100))
|
|
|
|
c_int_05 = config.CollectionTemplate(
|
|
name="c_int_05", fields=[
|
|
config.FieldTemplate(name="in1", data_type=config.DataType.INTEGER,
|
|
distribution=distributions["int_normal"], indexed=True),
|
|
config.FieldTemplate(name="mixed1", data_type=config.DataType.STRING,
|
|
distribution=distributions["string_mixed"], indexed=False),
|
|
config.FieldTemplate(name="uniform1", data_type=config.DataType.STRING,
|
|
distribution=distributions["string_uniform"], indexed=False),
|
|
config.FieldTemplate(name="in2", data_type=config.DataType.INTEGER,
|
|
distribution=distributions["int_normal"], indexed=True),
|
|
config.FieldTemplate(name="mixed2", data_type=config.DataType.STRING,
|
|
distribution=distributions["string_mixed"], indexed=False),
|
|
], compound_indexes=[], cardinalities=collection_cardinalities)
|
|
|
|
c_arr_01 = config.CollectionTemplate(
|
|
name="c_arr_01", fields=[
|
|
config.FieldTemplate(name="as", data_type=config.DataType.INTEGER,
|
|
distribution=distributions["array_small"], indexed=True)
|
|
], compound_indexes=[], cardinalities=collection_cardinalities)
|
|
|
|
index_scan = create_index_scan_collection_template('index_scan', 1000)
|
|
|
|
physical_scan = create_physical_scan_collection_template('physical_scan', 64)
|
|
|
|
# Data Generator settings
|
|
data_generator_config = config.DataGeneratorConfig(
|
|
enabled=True, create_indexes=False, batch_size=10000,
|
|
collection_templates=[index_scan, physical_scan, c_int_05, c_arr_01],
|
|
write_mode=config.WriteMode.REPLACE, collection_name_with_card=True)
|