# Source code for snsynth.transform.label
from snsynth.transform.definitions import ColumnType
from .base import ColumnTransformer
import numpy as np
class LabelTransformer(ColumnTransformer):
    """Transforms categorical values into integer-indexed labels.

    Labels will be sorted if possible, so that the output can be used as an
    ordinal. The indices will be 0-based. Values are sorted only when all
    non-null values share a single type and are mutually orderable; otherwise
    labels keep the order in which values were first seen during fitting.

    Null values (``None`` and floating-point NaN) are treated as one category.

    :param nullable: If null values are expected, a label is reserved for
        null even when no null was seen during fitting. When labels are
        sorted, the null label is the last index.
    """

    def __init__(self, nullable=True):
        super().__init__()
        self.nullable = nullable

    @property
    def output_type(self):
        """Column type produced by this transformer (categorical)."""
        return ColumnType.CATEGORICAL

    @property
    def cardinality(self):
        """Single-element list holding the number of known categories."""
        return [len(self.categories)]

    @staticmethod
    def _normalize_null(val):
        """Map floating-point NaN to ``None`` so every null shares one label.

        NumPy floating scalars such as ``np.float32`` are not ``float``
        subclasses, so they are checked explicitly.
        """
        if isinstance(val, (float, np.floating)) and np.isnan(val):
            return None
        return val

    def _fit(self, val):
        """Record ``val`` as a category if it has not been seen before."""
        val = self._normalize_null(val)
        if val not in self.labels:
            self.labels[val] = self.category
            self.categories[self.category] = val
            self.category += 1
        self.output_width = 1

    def _fit_finish(self):
        """Finalize the label mapping once all values have been seen.

        Rebuilds ``labels`` (value -> index) and ``categories`` (a list,
        index -> value). Non-null values are sorted when possible so the
        indices can be used as ordinals.
        """
        self._fit_complete = True
        # try sorting the categories so this can be used in ordinals
        vals = [v for v in self.labels if v is not None]
        ordered = None
        if len({type(v) for v in vals}) <= 1:
            try:
                ordered = sorted(vals)
            except TypeError:
                # Same type but no ordering defined (e.g. complex numbers).
                ordered = None
        if ordered is not None:
            if self.nullable:
                ordered.append(None)
        else:
            # Keep first-seen order; dict keys iterate in insertion order,
            # which matches the indices assigned in _fit.
            ordered = list(self.labels)
            if self.nullable and None not in self.labels:
                # Reserve a null label so _transform(None) works, matching
                # the sorted path.
                ordered.append(None)
        self.labels = {label: idx for idx, label in enumerate(ordered)}
        self.categories = ordered

    def _clear_fit(self):
        """Discard all fitted state."""
        self._reset_fit()
        self.labels = {}
        # Index -> value mapping while fitting; _fit_finish replaces it
        # with a list.
        self.categories = {}
        self.category = 0

    def _transform(self, val):
        """Return the integer label for ``val``.

        :raises KeyError: If ``val`` has no label (for example, it was not
            seen during fitting).
        """
        return self.labels[self._normalize_null(val)]

    def _inverse_transform(self, val):
        """Return the original value for the integer label ``val``.

        A ``None`` label maps back to ``None`` when the transformer is
        nullable.
        """
        if val is None and self.nullable:
            return None
        return self.categories[val]