# mongo/buildscripts/cost_model/random_generator.py

# Copyright (C) 2022-present MongoDB, Inc.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the Server Side Public License, version 1,
# as published by MongoDB, Inc.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# Server Side Public License for more details.
#
# You should have received a copy of the Server Side Public License
# along with this program. If not, see
# <http://www.mongodb.com/licensing/server-side-public-license>.
#
# As a special exception, the copyright holders give permission to link the
# code of portions of this program with the OpenSSL library under certain
# conditions as described in each individual source file and distribute
# linked combinations including the program with the OpenSSL library. You
# must comply with the Server Side Public License in all respects for
# all of the code used other than as permitted herein. If you modify file(s)
# with this exception, you may extend this exception to your version of the
# file(s), but you are not obligated to do so. If you do not wish to do so,
# delete this exception statement from your version. If you delete this
# exception statement from all source files in the program, then also delete
# it in the license file.
#
"""Random data generator of various distributions."""

from __future__ import annotations
from ctypes import Union
from dataclasses import dataclass
from enum import Enum
from itertools import chain
from typing import Generic, Sequence, TypeVar
import numpy as np

# Public API of this module, i.e. the names exported by a star-import. Every public class
# defined below is listed so that the array/document generators and the distribution enum are
# exported together with the basic generators.
__all__ = [
    'RangeGenerator',
    'DataType',
    'DistributionType',
    'RandomDistribution',
    'ArrayRandomDistribution',
    'DocumentRandomDistribution',
]


class DataType(Enum):
    """Kinds of values a data generator can be asked to produce."""

    STRING = 0  # text values
    INTEGER = 1  # whole numbers
    FLOAT = 2  # floating point numbers


# Type variable for the values handled by the generators below; it is constrained to exactly
# one of `str`, `int` or `float`.
TVar = TypeVar('TVar', str, int, float)


@dataclass
class RangeGenerator(Generic[TVar]):
    """Deterministic (non-random) sequence of values covering an interval with a fixed step.

    `data_type` selects the range function that is used; `interval_begin`, `interval_end` and
    `step` are forwarded to that function unchanged.
    """

    data_type: DataType
    interval_begin: TVar
    interval_end: TVar
    step: int = 1

    def generate(self) -> Sequence[TVar]:
        """Return the whole range as a list.

        Raises:
            ValueError: if `data_type` has no associated range function.
        """

        # One range-like callable per supported data type.
        range_functions = {
            DataType.STRING: ansi_range,
            DataType.INTEGER: range,
            DataType.FLOAT: np.arange,
        }

        if self.data_type not in range_functions:
            raise ValueError(f'Unsupported data type: {self.data_type}')

        make_range = range_functions[self.data_type]
        return list(make_range(self.interval_begin, self.interval_end, self.step))


def ansi_range(begin: str, end: str, step: int = 1):
    """Lazily yield strings from `begin` up to, but not including, `end`.

    The common prefix of `begin` and `end` is kept verbatim. The remaining suffixes are lowercased
    and read as base-28 numbers: 'a'..'z' are the digits 1..26 and any other character is the
    digit 27. Every number of `range(begin_number, end_number, step)` is converted back to text
    and appended to the prefix.

    When converting back, digit 27 is written as '_' and a zero digit is written as '`' (the
    character right before 'a'); the number 0 itself becomes the empty string.
    """

    base = 28
    filler = '_'
    filler_digit = base - 1
    # Added to a digit to obtain its letter's code point ('a' is digit 1).
    letter_offset = ord('a') - 1

    def encode(text: str) -> int:
        value = 0
        for symbol in text.lower():
            digit = ord(symbol) - letter_offset if 'a' <= symbol <= 'z' else filler_digit
            value = value * base + digit
        return value

    def decode(value: int) -> str:
        # Digits come out least significant first, hence the reversal at the end.
        symbols = []
        while value != 0:
            value, digit = divmod(value, base)
            symbols.append(filler if digit == filler_digit else chr(digit + letter_offset))
        return ''.join(reversed(symbols))

    def shared_prefix_length(left: str, right: str) -> int:
        for position, (lhs, rhs) in enumerate(zip(left, right)):
            if lhs != rhs:
                return position
        # One string is a prefix of the other (or they are equal).
        return min(len(left), len(right))

    cut = shared_prefix_length(begin, end)
    prefix = begin[:cut]
    first = encode(begin[cut:])
    stop = encode(end[cut:])
    for number in range(first, stop, step):
        yield prefix + decode(number)


class DistributionType(Enum):
    """Distributions the random data generator knows how to sample from."""

    CHOICE = 0  # weighted pick among the values
    NORMAL = 1  # normal distribution over the value indexes
    NONCENTRAL_CHISQUARE = 2  # noncentral chi-square distribution over the value indexes
    UNIFORM = 3  # uniform distribution over the value indexes
    MIXED = 4  # weighted combination of child distributions


# NumPy random Generator shared by all distributions in this module. It is created once at
# import time and without an explicit seed.
_rng = np.random.default_rng()


@dataclass
class RandomDistribution:
    """Produces random sequence of the specified values with the specified distribution.

    Attributes:
        distribution_type: which sampling strategy `generate` uses.
        values: the candidate values, a RangeGenerator producing them, or (for MIXED) the child
            distributions.
        weights: relative weights for CHOICE and MIXED (normalised in `generate`), a
            RangeGenerator producing them, or None for the other distributions.

    Instances are normally created through the static factory methods below.
    """

    # The annotations are never evaluated at runtime because the module enables
    # `from __future__ import annotations`, which is what makes the `X | Y` spelling safe here.
    distribution_type: DistributionType
    values: Sequence[TVar] | Sequence[RandomDistribution] | RangeGenerator
    weights: Sequence[float] | RangeGenerator | None

    @staticmethod
    def choice(values: Sequence[TVar], weights: Sequence[float] | RangeGenerator):
        """Create choice distribution: each value is picked proportionally to its weight."""
        return RandomDistribution(distribution_type=DistributionType.CHOICE, values=values,
                                  weights=weights)

    @staticmethod
    def normal(values: Sequence[TVar] | RangeGenerator):
        """Create normal distribution centred on the middle of `values`."""
        return RandomDistribution(distribution_type=DistributionType.NORMAL, values=values,
                                  weights=None)

    @staticmethod
    def noncentral_chisquare(values: Sequence[TVar] | RangeGenerator):
        """Create Non Central Chi2 distribution over the indexes of `values`."""
        return RandomDistribution(distribution_type=DistributionType.NONCENTRAL_CHISQUARE,
                                  values=values, weights=None)

    @staticmethod
    def uniform(values: Sequence[TVar] | RangeGenerator):
        """Create uniform distribution over `values`."""
        return RandomDistribution(distribution_type=DistributionType.UNIFORM, values=values,
                                  weights=None)

    @staticmethod
    def mixed(children: Sequence[RandomDistribution], weight: Sequence[float] | RangeGenerator):
        """Create mixed distribution: child `i` contributes a `weight[i]` share of the output."""
        return RandomDistribution(distribution_type=DistributionType.MIXED, values=children,
                                  weights=weight)

    def generate(self, size: int) -> Sequence[TVar]:
        """Generate random data sequence of the given size.

        Raises:
            ValueError: if values and weights differ in length, if there are no values, or if
                the distribution type is not supported.
        """

        values = self.values.generate() if isinstance(self.values, RangeGenerator) else self.values
        weights = (self.weights.generate()
                   if isinstance(self.weights, RangeGenerator) else self.weights)

        # Normalise the weights into probabilities that sum to 1.
        probs = None
        if weights is not None:
            weights_sum = sum(weights)
            probs = [p / weights_sum for p in weights]

        if probs is not None and len(probs) != len(values):
            raise ValueError(f'values and probs must be the same size: {probs} !! {values}')

        if len(values) == 0:
            raise ValueError(f"Values cannot be empty: {self.values}")

        generators = {
            DistributionType.CHOICE: RandomDistribution._choice,
            DistributionType.NORMAL: RandomDistribution._normal,
            DistributionType.NONCENTRAL_CHISQUARE: RandomDistribution._noncentral_chisquare,
            DistributionType.UNIFORM: RandomDistribution._uniform,
            DistributionType.MIXED: RandomDistribution._mixed,
        }

        gen = generators.get(self.distribution_type)
        if gen is None:
            raise ValueError(f"Unsupported distribution type: {self.distribution_type}")

        return gen(size, values, probs)

    def get_values(self):
        """Return a list of values used to generate a random sequence.

        For a mixed distribution this is the concatenation of the children's values.
        """
        if self.distribution_type == DistributionType.MIXED:
            return list(chain.from_iterable(child.get_values() for child in self.values))

        if isinstance(self.values, RangeGenerator):
            return self.values.generate()

        return self.values

    @staticmethod
    def _value_at(values: Sequence[TVar], raw_index):
        """Return `values[int(raw_index)]`, clamping the index into the valid range."""
        # We need to consider how to deal with the values which lie outside of the boundaries.
        # Perhaps, regenerate such values?
        index = min(max(int(raw_index), 0), len(values) - 1)
        return values[index]

    @staticmethod
    def _choice(size: int, values: Sequence[TVar], probs: Sequence[float]):
        """Pick `size` values, value `i` being chosen with probability `probs[i]`."""
        if probs is None:
            raise ValueError("props must be specified for choice distribution")
        # Draw positions instead of values: handing `values` to NumPy would coerce them into a
        # homogeneous array and return NumPy scalars (e.g. np.int64, which is not an `int` and
        # is rejected by the length checks of the array and document generators).
        picks = _rng.choice(len(values), size=size, p=probs)
        return [values[pick] for pick in picks]

    @staticmethod
    def _normal(size: int, values: Sequence[TVar], _: Sequence[float]):
        """Pick `size` values using normally distributed indexes around the middle of `values`."""
        # According to the 68-95-99.7 rule 99.7% of samples lie within three standard deviations
        # of the mean, so with a stddev of `len(values) / 6` 99.7% of the indexes would fall
        # inside the `values` array. A slightly smaller stddev, `len(values) / 6.5`, is used so
        # that even fewer indexes end up outside the boundaries and have to be clamped.
        mean = len(values) / 2
        stddev = len(values) / 6.5

        return [
            RandomDistribution._value_at(values, n)
            for n in _rng.normal(loc=mean, scale=stddev, size=size)
        ]

    @staticmethod
    def _noncentral_chisquare(size: int, values: Sequence[TVar], _: Sequence[float]):
        """Pick `size` values using noncentral chi-square distributed indexes."""
        # Define `df` and `nonc` parameters in a way to minimize chances that generated values are
        # out of bounds of the `values` array.
        df = len(values) / 10
        nonc = len(values) / 3.5

        return [
            RandomDistribution._value_at(values, n)
            for n in _rng.noncentral_chisquare(df=df, nonc=nonc, size=size)
        ]

    @staticmethod
    def _uniform(size: int, values: Sequence[TVar], _: Sequence[float]):
        """Pick `size` values using uniformly distributed indexes."""
        # Samples are floats drawn from [0, len(values)); truncating them gives list indexes.
        return [values[int(n)] for n in _rng.uniform(low=0, high=len(values), size=size)]

    @staticmethod
    def _mixed(size: int, children: Sequence[RandomDistribution], probs: Sequence[float]):
        """Concatenate the outputs of `children`, child `i` providing a `probs[i]` share.

        `probs` is expected to sum to 1 (as produced by `generate`). The children's sizes are
        derived from rounded cumulative shares and the last child receives whatever is left, so
        exactly `size` values are returned. Truncating every share separately would lose
        elements, e.g. shares of 0.29, 0.57 and 0.14 of 100 would yield only 98 values.

        Raises:
            ValueError: if `probs` is None or a child is not a RandomDistribution.
        """
        if probs is None:
            raise ValueError("props must be specified for mixed distribution")

        pairs = list(zip(children, probs))
        chunks = []
        allotted = 0  # number of elements handed out to the children processed so far
        cumulative = 0.0  # sum of the shares of the children processed so far
        for position, (child_distr, prob) in enumerate(pairs, start=1):
            if not isinstance(child_distr, RandomDistribution):
                raise ValueError(
                    "children must be of type RandomDistribution for mixed distribution")
            cumulative += prob
            if position == len(pairs):
                boundary = size
            else:
                # Keep the boundaries monotonic and never beyond `size`.
                boundary = min(size, max(allotted, int(round(size * cumulative))))
            chunks.append(child_distr.generate(boundary - allotted))
            allotted = boundary

        return list(chain.from_iterable(chunks))


@dataclass
class ArrayRandomDistribution(RandomDistribution):
    """Produces random array sequence of the specified values with the specified distribution.

    Attributes:
        lengths_distr: distribution the length of every generated array is drawn from.
        value_distr: distribution the elements of every generated array are drawn from.
    """

    lengths_distr: RandomDistribution
    value_distr: RandomDistribution

    def __init__(self, lengths_distr: RandomDistribution, value_distr: RandomDistribution):
        # NOTE(review): the fields declared on RandomDistribution (distribution_type, values,
        # weights) are not initialised here, so the inherited get_values(), which reads
        # self.distribution_type, looks like it would raise AttributeError on an instance of this
        # class. Confirm whether that is intended.
        self.lengths_distr = lengths_distr
        self.value_distr = value_distr

    def generate(self, size: int):
        """Generate random array sequence of the given size.

        Returns:
            A list of `size` arrays (lists); the length of each one is drawn from
            `lengths_distr` and its elements are generated by `value_distr`.

        Raises:
            ValueError: if `lengths_distr` produces a length that is not an integer.
        """
        arrays = []
        for length in self.lengths_distr.generate(size):
            # NumPy integers (e.g. elements of a NumPy array of lengths) are valid lengths too,
            # although they are not instances of `int`.
            if not isinstance(length, (int, np.integer)):
                raise ValueError("length must be an int for array generation")
            arrays.append(self.value_distr.generate(int(length)))
        return arrays


@dataclass
class DocumentRandomDistribution(RandomDistribution):
    """Produces random document sequence of the specified values with the specified distribution.

    Attributes:
        number_of_fields_distr: distribution of the number of field names drawn per document.
        fields_distr: distribution the field names are drawn from.
        field_to_distribution: maps every field name to the distribution of its values.
    """

    number_of_fields_distr: RandomDistribution
    fields_distr: RandomDistribution
    field_to_distribution: dict

    def __init__(self, number_of_fields_distr: RandomDistribution, fields_distr: RandomDistribution,
                 field_to_distribution: dict):
        """Store the distributions and check that every field has a value distribution.

        Raises:
            ValueError: if a field produced by `fields_distr` is missing from
                `field_to_distribution`.
        """
        self.number_of_fields_distr = number_of_fields_distr
        self.fields_distr = fields_distr
        self.field_to_distribution = field_to_distribution

        if any(field not in self.field_to_distribution for field in self.get_fields()):
            raise ValueError("Must provide a RandomDistribution for each field")

    def generate(self, size: int):
        """Generate random document sequence of the given size.

        Returns:
            A list of `size` dicts. For every document a number of field names is drawn from
            `fields_distr`; a name drawn more than once appears only once in the document.

        Raises:
            ValueError: if `number_of_fields_distr` produces a value that is not an integer.
        """
        nums = self.number_of_fields_distr.generate(size)

        # Pre-generate values for each field with corresponding distribution.
        # Note that not all values generated would be used because the number of fields of a
        # document is randomly generated as well.
        # Document `idx` reads element `idx` of these lists, so every field distribution has to
        # return at least `size` values.
        field_to_values = {
            field: self.field_to_distribution[field].generate(size) for field in self.get_fields()
        }

        docs = []
        for idx, num in enumerate(nums):
            # NumPy integers are accepted as well, although they are not instances of `int`.
            if not isinstance(num, (int, np.integer)):
                raise ValueError("the number of fields must be an int for document generation")

            field_names = self.fields_distr.generate(int(num))
            docs.append({field: field_to_values[field][idx] for field in field_names})

        return docs

    def get_fields(self):
        """Return a list of field names used to generate a random document."""
        return self.fields_distr.get_values()


# Demo: when the module is run as a script, build a number of sample distributions and print
# what they generate.
if __name__ == '__main__':
    from collections import Counter

    def print_distr(title, distr, size=10000):
        """Generate `size` samples from `distr` and print them under `title`.

        Scalar samples are printed as a histogram with one row per value of
        `distr.get_values()`: the value, its count and one '*' per ten occurrences.
        If any sample is a list or a dict, every sample is printed on its own line instead.
        """
        print(f'\n{title}\n')
        rs = distr.generate(size)
        has_arrays = any(isinstance(elem, list) for elem in rs)
        has_dict = any(isinstance(elem, dict) for elem in rs)

        if not has_arrays and not has_dict:
            counter = Counter(rs)
            for value in distr.get_values():
                count = counter[value]
                if isinstance(value, float):
                    print(f'{value:.2f}\t{count}\t{(count//10)*"*"}')
                else:
                    print(f'{value}\t{count}\t{(count//10)*"*"}')
        else:
            for elem in rs:
                print(elem)

    # Weighted choice among four fixed strings.
    choice = RandomDistribution.choice(values=['pooh', 'rabbit', 'piglet', 'Chris'],
                                       weights=[0.5, 0.1, 0.1, 0.3])
    print_distr("Choice", choice, 1000)

    # Normal distribution over a generated range of strings.
    string_generator = RangeGenerator(data_type=DataType.STRING, interval_begin='hello_a',
                                      interval_end='hello__')
    str_normal = RandomDistribution.normal(string_generator)
    print_distr("Normal for strings", str_normal)

    # Noncentral chi-square distribution over the integers 1..29.
    int_noncentral_chisquare = RandomDistribution.noncentral_chisquare(list(range(1, 30)))
    print_distr("Noncentral Chisquare for integers", int_noncentral_chisquare)

    # Uniform distribution over a generated range of floats.
    float_uniform = RandomDistribution.uniform(RangeGenerator(DataType.FLOAT, 0.1, 10.0, 0.37))
    print_distr("Uniform for floats", float_uniform)

    # Mixed distribution combining floats and two string ranges.
    # NOTE(review): despite its name, `str_chisquare2` is created with
    # RandomDistribution.normal -- confirm whether noncentral_chisquare was intended.
    str_chisquare2 = RandomDistribution.normal(RangeGenerator(DataType.STRING, "aa", "ba"))
    str_normal2 = RandomDistribution.normal(RangeGenerator(DataType.STRING, "ap", "bp"))
    mixed = RandomDistribution.mixed(children=[float_uniform, str_chisquare2, str_normal2],
                                     weight=[0.3, 0.5, 0.2])
    print_distr("Mixed", mixed, 20_000)

    # Distribution of the integers 2..9; used below for array lengths and as a field value.
    int_normal = RandomDistribution.normal(RangeGenerator(DataType.INTEGER, 2, 10))

    # Arrays whose elements come from the mixed distribution.
    arr_distr = ArrayRandomDistribution(int_normal, mixed)
    print_distr("Mixed Arrays", arr_distr, 100)

    # Arrays whose elements may themselves be arrays.
    mixed_with_arrays = RandomDistribution.mixed(children=[float_uniform, str_normal2, arr_distr],
                                                 weight=[0.3, 0.2, 0.5])
    nested_arr_distr = ArrayRandomDistribution(int_normal, mixed_with_arrays)

    print_distr("Mixed Nested Arrays", nested_arr_distr, 100)

    # Document distribution whose only possible field is "obj".
    simple_doc_distr = DocumentRandomDistribution(
        RandomDistribution.normal(RangeGenerator(DataType.INTEGER, 1, 2)),
        RandomDistribution.uniform(["obj"]), {"obj": int_normal})

    # Documents with up to six fields 'a'..'f', each with its own value distribution,
    # including arrays and a nested document.
    field_name_choice = RandomDistribution.uniform(['a', 'b', 'c', 'd', 'e', 'f'])

    field_to_distr = {
        'a': int_normal, 'b': str_normal, 'c': mixed, 'd': arr_distr, 'e': nested_arr_distr,
        'f': simple_doc_distr
    }
    nested_doc_distr = DocumentRandomDistribution(
        RandomDistribution.normal(RangeGenerator(DataType.INTEGER, 0, 7)), field_name_choice,
        field_to_distr)

    print_distr("Nested Document generation", nested_doc_distr, 100)