# Source code for snsynth.quail

import logging
import warnings

import numpy as np
import pandas as pd

from snsynth.base import Synthesizer

logger = logging.getLogger(__name__)


class QUAILSynthesizer(Synthesizer):
    """
    Quailified Architecture to Improve Labeling.
    Divide epsilon in a known classification task
    between a differentially private synthesizer and
    classifier. Train DP classifier on real, fit DP synthesizer
    to features (excluding the target label),
    and use synthetic data from the DP synthesizer with
    the DP classifier to create artificial labels. Produces
    complete synthetic data.

    More information here:
    Differentially Private Synthetic Data: Applied Evaluations and Enhancements
    https://arxiv.org/abs/2011.05537

    :param epsilon: Total epsilon used across the DP Synthesizer and DP Classifier
    :type epsilon: float
    :param dp_synthesizer: A function that returns an instance of a DP Synthesizer
        for a specified epsilon value
    :type dp_synthesizer: function (epsilon) -> SDGYMBaseSynthesizer
    :param dp_classifier: A function that returns an instance of a DP Classifier
        for a specified epsilon value
    :type dp_classifier: function (epsilon) -> classifier
    :param target: The column name of the target column
    :type target: str
    :param test_size: Fraction of the data that should be used for the test set,
        defaults to 0.2
    :type test_size: float, optional
    :param seed: Seed for controlling randomness for testing, defaults to None
    :type seed: int, optional
    :param eps_split: Fraction of epsilon used for the classifier.
        1 - eps_split is used for the Synthesizer, defaults to 0.9
    :type eps_split: float, optional
    """

    def __init__(
        self,
        epsilon,
        dp_synthesizer,
        dp_classifier,
        target,
        test_size=0.2,
        seed=None,
        eps_split=0.9,
    ):
        """
        Store the configuration. No model is created until :meth:`fit` is called.

        :param epsilon: Total epsilon used across the DP Synthesizer and DP Classifier
        :type epsilon: float
        :param dp_synthesizer: A function that returns an instance of a DP Synthesizer
            for a specified epsilon value (called as ``dp_synthesizer(epsilon=...)``)
        :type dp_synthesizer: function (epsilon) -> SDGYMBaseSynthesizer
        :param dp_classifier: A function that returns an instance of a DP Classifier
            for a specified epsilon value (called as ``dp_classifier(epsilon=...)``)
        :type dp_classifier: function (epsilon) -> classifier
        :param target: The column name of the target column
        :type target: str
        :param test_size: Fraction of the data that should be used for the test set,
            defaults to 0.2
        :type test_size: float, optional
        :param seed: Seed for controlling randomness for testing, defaults to None
        :type seed: int, optional
        :param eps_split: Fraction of epsilon used for the classifier.
            1 - eps_split is used for the Synthesizer, defaults to 0.9
        :type eps_split: float, optional
        """
        self.epsilon = epsilon
        self.eps_split = eps_split
        self.dp_synthesizer = dp_synthesizer
        self.dp_classifier = dp_classifier
        self.target = target
        self.test_size = test_size
        self.seed = seed

        # Model (both are populated by fit)
        self.private_model = None
        self.private_synth = None

        # Pandas check
        self.pandas = False
        self.pd_cols = None
        self.pd_index = None

    @staticmethod
    def _coerce_numeric(frame):
        """
        Return a copy of ``frame`` in which every column that parses cleanly
        as numeric has been converted; other columns are left as they were.

        Working on a copy keeps the caller's DataFrame untouched.
        """
        converted = frame.copy()
        for col in converted.columns:
            try:
                converted[col] = pd.to_numeric(converted[col])
            except (ValueError, TypeError):
                # Not a numeric column: keep the original values.
                pass
        return converted

    def fit(
        self,
        data,
        categorical_columns=tuple(),
        ordinal_columns=tuple(),
        transformer=None,
        continuous_columns=None,
        verbose=None,
        preprocessor_eps=0.0,
        nullable=False,
    ):
        """
        Takes a dataset and fits the synthesizer/learning model to it, using the epsilon split
        specified in the init.

        The classifier is built with ``epsilon * eps_split`` and trained on a
        train split of the real data; the synthesizer is built with
        ``epsilon * (1 - eps_split)`` and fit on the feature columns only.
        The classifier's report and accuracy on the held-out split are stored
        in ``self.class_report`` and ``self.target_accuracy``.

        :param data: Data, including the target column
        :type data: pd.DataFrame
        :param categorical_columns: Forwarded to the synthesizer's ``fit``
        :param ordinal_columns: Forwarded to the synthesizer's ``fit``
        :param transformer: Forwarded to the synthesizer's ``fit``
        :param continuous_columns: Forwarded to the synthesizer's ``fit``
        :param verbose: Deprecated. When truthy, the internal model report is
            logged at INFO level.
        :param preprocessor_eps: Forwarded to the synthesizer's ``fit``
        :param nullable: Forwarded to the synthesizer's ``fit``
        :raises TypeError: if ``data`` is not a pandas DataFrame
        :raises ValueError: if the target column is not present in ``data``
        """
        if verbose is not None:
            warnings.warn(
                "verbose is deprecated. Use logging.setLevel instead",
                stacklevel=2,
            )

        # Imported lazily so that sklearn is only required when fitting.
        from sklearn.model_selection import train_test_split
        from sklearn.metrics import classification_report
        from sklearn.metrics import accuracy_score

        if not isinstance(data, pd.DataFrame):
            raise TypeError("Only pandas dataframes for data as of now.")
        if self.target not in data.columns:
            raise ValueError(
                "Target column {!r} not found in data.".format(self.target)
            )

        self.pandas = True
        data = self._coerce_numeric(data)
        self.data = data
        self.pd_cols = data.columns
        self.pd_index = data.index

        private_features = data.loc[:, data.columns != self.target]
        private_target = data.loc[:, data.columns == self.target]
        x_train, x_test, y_train, y_test = train_test_split(
            private_features,
            private_target,
            test_size=self.test_size,
            random_state=self.seed,
        )

        # Here we train a differentially private model on the real
        # data. We report on the accuracy for now to give a sense of
        # the upper bound on performance in the sampling step.
        self.private_model = self.dp_classifier(epsilon=(self.epsilon * self.eps_split))
        self.private_model.fit(x_train, y_train.values.ravel())
        predictions = self.private_model.predict(x_test)
        y_true = np.ravel(y_test)
        self.class_report = classification_report(
            y_true, predictions, labels=np.unique(predictions)
        )
        self.target_accuracy = accuracy_score(y_true, predictions)

        # Report through the module logger so that configuring the
        # "snsynth.quail" logger controls this output.
        log_level = logging.INFO if verbose else logger.level
        logger.log(log_level, "Internal model report: ")
        logger.log(log_level, "%s", self.class_report)
        logger.log(log_level, "%s", self.target_accuracy)

        # We use the features in our synthesis.
        self.private_synth = self.dp_synthesizer(
            epsilon=(self.epsilon * (1 - self.eps_split))
        )
        self.private_synth.fit(
            data=private_features,
            categorical_columns=categorical_columns,
            ordinal_columns=ordinal_columns,
            transformer=transformer,
            continuous_columns=continuous_columns,
            preprocessor_eps=preprocessor_eps,
            nullable=nullable,
        )

        # Log whichever fitted attributes the classifier exposes.
        for attr in ("coef_", "intercept_", "classes_"):
            if hasattr(self.private_model, attr):
                logger.log(log_level, "%s", getattr(self.private_model, attr))

    def sample(self, samples):
        """
        Sample from the synthesizer model.

        Features are drawn from the fitted synthesizer and the fitted
        classifier predicts the target for each drawn row; the predictions
        are stored under the target column name on the sampled features.
        :meth:`fit` must have been called first.

        :param samples: The number of samples to create
        :type samples: int
        :return: The sampled features with the predicted target column added
        :rtype: pd.DataFrame
        """
        sampled_features = self.private_synth.sample(samples)
        y_values = self.private_model.predict(sampled_features)

        sampled_features[self.target] = y_values
        return sampled_features