Source code for snsynth.transform.anonymization

from .base import ColumnTransformer
from .definitions import ColumnType
from faker import Faker


class AnonymizationTransformer(ColumnTransformer):
    """
    Transformer that can be used to anonymize personally identifiable
    information (PII) or other values.

    By default, the existing values are discarded during transformation and
    not used by a synthesizer. During inverse transformation new values will
    be generated according to the specified ``fake``.

    If ``fake_inbound`` is true, the new values will be injected during
    transformation and passed through on inverse. This might be useful for
    e.g. operation in a ChainTransformer.

    Beware that the provided ``fake`` is called once to verify that the
    provided (keyword) arguments are valid.

    :param fake: Text reference to Faker method (e.g. 'email') or custom
        callable
    :type fake: str or callable, required
    :param args: Arguments for the method
    :type args: args, optional
    :param faker_setup: Dictionary with keyword arguments for Faker
        initialization e.g. {'locale': 'de_DE'}
    :type faker_setup: dict, optional
    :param fake_inbound: Defaults to False.
    :type fake_inbound: bool, optional
    :param kwargs: Keyword arguments for the method
    :type kwargs: kwargs, optional
    :raises ValueError: If ``faker_setup`` is rejected by Faker, if ``fake``
        names a method Faker does not provide, or if calling ``fake`` with
        the given arguments raises a ``TypeError``.
    """

    def __init__(self, fake, *args, faker_setup=None, fake_inbound=False, **kwargs):
        # Assigned before super().__init__(): _clear_fit reads this flag.
        # NOTE(review): presumably the base initializer triggers _clear_fit;
        # confirm against ColumnTransformer.
        self.fake_inbound = fake_inbound
        super().__init__()

        if isinstance(fake, str):
            # assume this references a Faker builtin
            fake = self._get_faker_builtin(fake, faker_setup, *args, **kwargs)

        self.fake = fake
        self.args = args
        self.kwargs = kwargs

        # verify that the provided arguments are valid by generating one
        # throw-away value
        try:
            self._generate_fake_data()
        except TypeError as e:
            raise ValueError(
                f"Provided arguments {args} and {kwargs} are invalid for `fake` {fake}"
            ) from e

    def _get_faker_builtin(self, fake, faker_setup, *args, **kwargs):
        """
        Creates a Faker instance and verifies that the given method is
        available. The instance is stored on ``self.faker``.

        :param fake: Text reference to Faker method
        :type fake: str, required
        :param faker_setup: Dictionary with keyword arguments for
            initializing Faker e.g. {'locale': 'de_DE'}
        :type faker_setup: dict, optional
        :param args: Arguments for the method (accepted but not used here)
        :type args: args, optional
        :param kwargs: Keyword arguments for the method (accepted but not
            used here)
        :type kwargs: kwargs, optional
        :return: Actual Faker method
        :rtype: callable
        :raises ValueError: If Faker cannot be initialized with
            ``faker_setup`` or does not expose an attribute named ``fake``.
        """
        # initialize Faker with provided setup or default; anything that is
        # not a dict (including None) falls back to the default setup
        if isinstance(faker_setup, dict):
            try:
                self.faker = Faker(**faker_setup)
            except Exception as e:
                raise ValueError(
                    f"Provided `faker_setup` {faker_setup} is invalid"
                ) from e
        else:
            self.faker = Faker()

        # verify that the provided fake is available
        try:
            fake_builtin = getattr(self.faker, fake)
        except AttributeError as e:
            raise ValueError(f"Provided `fake` {fake} is not available in Faker") from e

        return fake_builtin

    @property
    def output_type(self):
        """Column type reported for this transformer's output."""
        return ColumnType.UNBOUNDED

    @property
    def cardinality(self):
        """Cardinality of the output; always a single ``None`` entry."""
        return [None]

    def _fit(self, _):
        """No-op: nothing is learned from the incoming values."""
        pass

    def _clear_fit(self):
        """Reset fit state; the transformer is always considered fitted."""
        self._fit_complete = True
        # One output column when fake values are injected on the way in,
        # none when the column is dropped.
        self.output_width = 1 if self.fake_inbound else 0

    def _generate_fake_data(self):
        """Call ``fake`` with the stored arguments and return its result."""
        return self.fake(*self.args, **self.kwargs)

    def _transform(self, _):
        """
        Transform a single value. The incoming value is always ignored.

        :return: A freshly generated fake value if ``fake_inbound`` is set,
            otherwise ``None``.
        """
        if self.fake_inbound:
            return self._generate_fake_data()
        return None

    def _inverse_transform(self, val):
        """
        Inverse-transform a single value.

        :param val: Value produced by the forward transformation.
        :return: ``val`` unchanged if ``fake_inbound`` is set, otherwise a
            freshly generated fake value.
        """
        if self.fake_inbound:
            return val
        return self._generate_fake_data()

    def transform(self, data, idx=None):
        """
        Transform a column of values or one column inside a list of rows.

        :param data: Iterable of values (``idx`` is None) or of rows.
            Rows are sliced and concatenated with tuples, so they are
            expected to be tuples.
        :param idx: Position of this transformer's column within each row.
        :type idx: int, optional
        :return: List of transformed values or rows. With ``idx`` given, the
            column is replaced by a fake value when ``fake_inbound`` is set
            and removed from every row otherwise.
        :rtype: list
        """
        if idx is None:
            return [self._transform(val) for val in data]
        if self.fake_inbound:
            # Inject the fake value in place so the row keeps the column,
            # consistent with output_width == 1.
            return [
                row[:idx] + (self._transform(row[idx]),) + row[idx + 1:]
                for row in data
            ]
        # output_width == 0: drop the column entirely
        return [row[:idx] + row[idx + 1:] for row in data]

    def inverse_transform(self, data, idx=None):
        """
        Invert :meth:`transform` for a column of values or a list of rows.

        :param data: Iterable of values (``idx`` is None) or of rows.
            Rows are sliced and concatenated with tuples, so they are
            expected to be tuples.
        :param idx: Position of this transformer's column within each row.
        :type idx: int, optional
        :return: List of values or rows. With ``idx`` given, the existing
            value is passed through when ``fake_inbound`` is set; otherwise
            a new fake value is inserted at ``idx``.
        :rtype: list
        """
        if idx is None:
            return [self._inverse_transform(val) for val in data]
        if self.fake_inbound:
            # The column was kept on the way in; pass its value through
            # instead of inserting an extra element.
            return [
                row[:idx] + (self._inverse_transform(row[idx]),) + row[idx + 1:]
                for row in data
            ]
        # The column was dropped on the way in; re-insert a generated value.
        return [
            row[:idx] + (self._inverse_transform(None),) + row[idx:]
            for row in data
        ]