import pandas as pd
import numpy as np
from pacsynth import (
DpAggregateSeededSynthesizer,
DpAggregateSeededParametersBuilder,
AccuracyMode,
FabricationMode,
)
from pacsynth import Dataset as AggregateSeededDataset
from snsynth.base import Synthesizer
"""
Wrapper for Private Aggregate Seeded Synthesizer from pac-synth:
https://pypi.org/project/pac-synth/.
A differentially-private synthesizer that relies on DP Marginals to
build synthetic data. It will compute DP Marginals (called aggregates)
for your dataset up to and including a specified reporting length, and
synthesize data based on the computed aggregated counts.
For documentation please refer to:
- https://github.com/microsoft/synthetic-data-showcase
- https://github.com/microsoft/synthetic-data-showcase/tree/main/docs/dp
"""
class AggregateSeededSynthesizer(Synthesizer):
"""
SmartNoise class wrapper for Private Aggregate Seeded Synthesizer from pac-synth.
Works with Pandas data frames and raw data, and follows the conventions set by other SmartNoise synthesizers.
:param reporting_length: The maximum length of the combinations to be synthesized. For example,
if reporting length is 2, the synthesizer will compute DP marginals for all two-column combinations
in the dataset.
:type reporting_length: int
:param epsilon: The privacy budget to be used for the synthesizer.
:type epsilon: float
:param delta: The delta value to be used for the synthesizer. If set, should be small, in the range
    of 1/(n * sqrt(n)), where n is the approximate number of records in the dataset.
:type delta: float, optional
:param percentile_percentage: Because the synthesizer computes multiple n-way marginals, each individual may
affect multiple marginals. The ``percentile_percentage`` can remove the influence of outliers to
reduce sensitivity and improve the accuracy of the synthesizer. For example, if ``percentile_percentage``
is 99, the synthesizer will use a sensitivity that can accommodate 99% of the individuals, and will ensure
that the records of the outlier 1% are sampled to conform to this sensitivity.
:type percentile_percentage: int
:param percentile_epsilon_proportion: The proportion of the epsilon budget to be used to estimate the
percentile sensitivity.
:type percentile_epsilon_proportion: float
:param verbose: Show diagnostic information about the synthesizer's progress.
:type verbose: bool
See the `pac-synth documentation <https://github.com/microsoft/synthetic-data-showcase/blob/main/docs/dp/README.md>`_
for more details about these and other hyperparameters.
Reuses and lightly modifies code from
`pac-synth <https://github.com/microsoft/synthetic-data-showcase/tree/main/packages/lib-pacsynth>`_.
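A minimal usage sketch (the ``NoTransformer`` import path is an assumption, and the toy
data is for illustration only):

.. code-block:: python

    import pandas as pd
    from snsynth.transform import NoTransformer

    # toy categorical data; real datasets should be much larger
    df = pd.DataFrame({
        "age_band": ["20-30", "30-40", "20-30", "40-50"],
        "region": ["north", "south", "north", "east"],
    })

    # reporting_length=2 computes DP marginals for all 1- and 2-column combinations
    synth = AggregateSeededSynthesizer(reporting_length=2, epsilon=1.0)
    synth.fit(df, transformer=NoTransformer())
    synthetic_df = synth.sample()  # a DataFrame, since the input was a DataFrame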
"""
def __init__(
self,
reporting_length=3,
epsilon=4.0,
delta=None,
percentile_percentage=99,
percentile_epsilon_proportion=0.01,
accuracy_mode=AccuracyMode.prioritize_long_combinations(),
number_of_records_epsilon_proportion=0.005,
fabrication_mode=FabricationMode.uncontrolled(),
empty_value="",
use_synthetic_counts=False,
weight_selection_percentile=95,
aggregate_counts_scale_factor=None,
verbose=False
):
"""
Wrapper for Private Aggregate Seeded Synthesizer from pac-synth.
For more information about the parameters run `help('pacsynth.DpAggregateSeededParametersBuilder')`.
"""
self.epsilon = epsilon
self.delta = delta
self.reporting_length = reporting_length
self.percentile_percentage = percentile_percentage
self.percentile_epsilon_proportion = percentile_epsilon_proportion
self.accuracy_mode = accuracy_mode
self.number_of_records_epsilon_proportion = number_of_records_epsilon_proportion
self.fabrication_mode = fabrication_mode
self.empty_value = empty_value
self.use_synthetic_counts = use_synthetic_counts
self.weight_selection_percentile = weight_selection_percentile
self.aggregate_counts_scale_factor = aggregate_counts_scale_factor
self.verbose = verbose
self.preprocessed = False
self.build_synthesizer()
def build_synthesizer(self):
builder = (
DpAggregateSeededParametersBuilder()
.reporting_length(self.reporting_length)
.epsilon(self.epsilon)
.percentile_percentage(self.percentile_percentage)
.percentile_epsilon_proportion(self.percentile_epsilon_proportion)
.accuracy_mode(self.accuracy_mode)
.number_of_records_epsilon_proportion(self.number_of_records_epsilon_proportion)
.fabrication_mode(self.fabrication_mode)
.empty_value(self.empty_value)
.use_synthetic_counts(self.use_synthetic_counts)
.weight_selection_percentile(self.weight_selection_percentile)
)
if self.aggregate_counts_scale_factor is not None:
builder = builder.aggregate_counts_scale_factor(
self.aggregate_counts_scale_factor
)
if self.delta is not None:
builder = builder.delta(self.delta)
self.parameters = builder.build()
self.synth = DpAggregateSeededSynthesizer(self.parameters)
self.dataset = None
self.pandas = False
def fit(
self,
data,
*ignore,
use_columns=None,
sensitive_zeros=None,
transformer=None,
categorical_columns=None,
ordinal_columns=None,
continuous_columns=None,
preprocessor_eps=0.0,
nullable=False
):
"""
Fit the synthesizer model on the data.
This will compute the differentially private aggregates used to
synthesize data.
All columns are expected to be categorical; non-categorical columns
should be binned in advance.
For more information run `help('pacsynth.Dataset')` and
`help('pacsynth.DpAggregateSeededSynthesizer.fit')`.
:param data: The data for fitting the synthesizer model.
:type data: pd.DataFrame, list[list[str]], AggregateSeededDataset
:param use_columns: List of column names to be used, defaults to None, meaning use all columns
:type use_columns: list[str], optional
:param sensitive_zeros: List of column names for which the value '0' is meaningful and should not be turned into an empty string.
:type sensitive_zeros: list[str], optional
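A sketch of fitting on raw data (a header row followed by string values; the
``NoTransformer`` import path and column names are assumptions, and ``synth`` is
assumed to be an instance of this class):

.. code-block:: python

    from snsynth.transform import NoTransformer

    raw = [
        ["age_band", "has_insurance"],   # first row holds the column names
        ["20-30", "1"],
        ["30-40", "0"],
        ["20-30", "0"],
    ]
    # keep the literal "0" values in has_insurance instead of treating them as empty
    synth.fit(
        raw,
        sensitive_zeros=["has_insurance"],
        transformer=NoTransformer(),
    )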
"""
before_eps = self.epsilon
train_data = self._get_train_data(
data,
style='cube',
transformer=transformer,
categorical_columns=categorical_columns,
ordinal_columns=ordinal_columns,
continuous_columns=continuous_columns,
nullable=True,
preprocessor_eps=preprocessor_eps
)
if self.epsilon != before_eps:
# preprocessor changed epsilon, rebuild synthesizer
self.build_synthesizer()
if self._transformer is None:
raise ValueError("We weren't able to fit a transformer to the data. Please check your data and try again.")
if self._transformer.output_width > 0:
colnames = ["column_{}".format(i) for i in range(len(train_data[0]))]
data = [colnames] + [[str(v) for v in row] for row in train_data]
assert sensitive_zeros is None, "sensitive zeros cannot be set with a transformer, please set transformer=NoTransformer()"
assert use_columns is None, "use columns cannot be set with a transformer, please set transformer=NoTransformer()"
sensitive_zeros = colnames
cards = self._transformer.cardinality
if any(c is None for c in cards):
raise ValueError("The transformer appears to have some continuous columns. Please provide only categorical or ordinal.")
dimensionality = np.prod(cards)
if self.verbose:
print(f"Fitting with {dimensionality} dimensions")
if isinstance(data, list) and all(map(lambda row: isinstance(row, list), data)):
self.dataset = AggregateSeededDataset(
data, use_columns=use_columns, sensitive_zeros=sensitive_zeros
)
self.pandas = False
elif isinstance(data, pd.DataFrame):
self.dataset = AggregateSeededDataset.from_data_frame(
data, use_columns=use_columns, sensitive_zeros=sensitive_zeros
)
self.pandas = True
elif isinstance(data, AggregateSeededDataset):
self.dataset = data
self.pandas = False
else:
raise ValueError(
"data should be either in raw format (List[List[]]) or a be pandas data frame (pd.DataFrame)"
)
self.synth.fit(self.dataset)
def sample(self, samples=None):
"""
Sample from the synthesizer model.
This will sample records from the generated differentially private aggregates.
If `samples` is `None`, the synthesizer will use all the available differentially
private attribute counts to synthesize records
(which will produce a number of records close to the original count).
For more information run `help('pacsynth.DpAggregateSeededSynthesizer.sample')`.
:param samples: The number of samples to create
:type samples: int, None
:return: Generated data samples; the output type matches the type of the data passed to `fit`.
:rtype: pd.DataFrame, list[list[str]]
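A sketch, assuming ``synth`` has already been fit:

.. code-block:: python

    # synthesize roughly as many records as the DP-protected original count
    synthetic = synth.sample()

    # or request an explicit number of records
    synthetic_1000 = synth.sample(1000)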
"""
result = self.synth.sample(samples)
if self._transformer is not None and self._transformer.output_width > 0:
result = [[int(v) if v != '' else None for v in row] for row in result[1:]]
result = self._transformer.inverse_transform(result)
return result
if self.pandas is True:
result = AggregateSeededDataset.raw_data_to_data_frame(result)
return result
def get_sensitive_aggregates(
self, combination_delimiter=";", reporting_length=None
):
"""
Returns the aggregates for the sensitive dataset. For more information run `help('pacsynth.Dataset.get_aggregates')`.
:param combination_delimiter: Combination delimiter to use, defaults to ';'
:type combination_delimiter: str, optional
:param reporting_length: Maximum length (inclusive) to compute attribute combinations for, defaults to the configured value in the synthesizer
:type reporting_length: int, optional
:return: A dictionary with the combination string representation as key and the combination count as value
:rtype: dict[str, int]
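A sketch (the column and attribute names in the comment are illustrative):

.. code-block:: python

    aggregates = synth.get_sensitive_aggregates(reporting_length=2)
    # keys are attribute combinations joined by the delimiter,
    # e.g. "age_band:20-30;region:north" -> 42
    for combination, count in list(aggregates.items())[:5]:
        print(combination, count)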
"""
if self.dataset is None:
raise RuntimeError(
"make sure 'fit' method has been successfully called first"
)
if reporting_length is None:
reporting_length = self.reporting_length
return self.dataset.get_aggregates(reporting_length, combination_delimiter)
def get_dp_aggregates(self, combination_delimiter=";"):
"""
Returns the aggregates for the sensitive dataset protected with differential privacy.
For more information run `help('pacsynth.DpAggregateSeededSynthesizer.get_dp_aggregates')`.
:param combination_delimiter: Combination delimiter to use, defaults to ';'
:type combination_delimiter: str, optional
:return: A dictionary with the combination string representation as key and the combination count as value
:rtype: dict[str, int]
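A sketch of comparing the protected counts against the sensitive ones (both calls are
methods of this class; ``synth`` is assumed to have been fit already):

.. code-block:: python

    dp_aggregates = synth.get_dp_aggregates(combination_delimiter=";")
    sensitive_aggregates = synth.get_sensitive_aggregates(combination_delimiter=";")
    # DP counts are noisy, and some combinations may be suppressed or fabricated
    print(len(dp_aggregates), "DP combinations vs", len(sensitive_aggregates), "sensitive")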
"""
return self.synth.get_dp_aggregates(combination_delimiter)
def get_dp_number_of_records(self):
"""
Gets the differentially private number of records computed with the `.fit` method.
This is different from the number of records requested in the `sample` method or present
in the synthetic data: it is the differentially private estimate of the number of records
in the original sensitive dataset (Laplacian noise added).
For more information run `help('pacsynth.DpAggregateSeededSynthesizer.get_dp_number_of_records')`.
:return: Number of sensitive records protected with differential privacy
:rtype: int
"""
return self.synth.get_dp_number_of_records()