In TF 2.3, Keras adds new preprocessing layers for image, text and structured data. The following notebook explores those new layers for dealing with structured data.
For a complete, end-to-end example of how to use the new preprocessing layers for structured data, see the structured-data classification example in the official Keras documentation.
# Toy feature table covering the three feature families encoded below:
# one string categorical, two integer categoricals, two numerical columns.
features = {
'categorical_string': ['LOW', 'HIGH', 'HIGH', 'MEDIUM'],
'categorical_integer_1': [1, 0, 1, 0],
'categorical_integer_2': [1, 2, 3, 4],
'numerical_1': [2.3, 0.2, 1.9, 5.8],
'numerical_2': [16, 32, 8, 60]
}
xdf = pd.DataFrame(features)
# Binary labels, one per row of xdf.
ydf = pd.DataFrame({'target': [0, 0, 0, 1]})
# Build a tf.data pipeline yielding (feature-dict, label) pairs; dict(xdf)
# maps each column name to its pandas Series.
ds = tf.data.Dataset.from_tensor_slices((dict(xdf), ydf))
# Peek at a single example to confirm the pipeline's structure.
for x, y in ds.take(1):
    print('X:', x)
    print('y:', y)
from tensorflow.keras.layers.experimental.preprocessing import Normalization
from tensorflow.keras.layers.experimental.preprocessing import CategoryEncoding
from tensorflow.keras.layers.experimental.preprocessing import StringLookup
Preprocessing helper function to encode numerical features, e.g. 0.1, 0.2, etc.
def create_numerical_encoder(dataset, name):
    """Build and adapt a Normalization layer for the numerical feature `name`.

    Args:
        dataset: tf.data.Dataset yielding (feature-dict, label) pairs.
        name: key of the numerical feature inside the feature dict.

    Returns:
        A Normalization layer whose statistics were learned from the data.
    """
    normalizer = Normalization()
    # Restrict the dataset to the single feature and add a trailing axis
    # so every element has shape (1,), as the layer expects.
    feature_only = dataset.map(lambda features, _: tf.expand_dims(features[name], -1))
    # Learn the feature's mean and variance from the data.
    normalizer.adapt(feature_only)
    return normalizer
# Apply normalization to a numerical feature.
normalizer = create_numerical_encoder(ds, 'numerical_1')
# BUG FIX: the original indexed xdf with `name`, a variable that only
# existed as a parameter inside the helper above and is undefined here
# (NameError at module scope); use the actual column label instead.
# Keras layers are invoked by calling them directly — Layer.apply is a
# deprecated alias for __call__.
normalizer(xdf['numerical_1'].values)
Preprocessing helper function to encode integer categorical features, e.g. 1, 2, 3
def create_integer_categorical_encoder(dataset, name):
    """Build and adapt a binary CategoryEncoding for the integer feature `name`.

    Args:
        dataset: tf.data.Dataset yielding (feature-dict, label) pairs.
        name: key of the integer categorical feature inside the feature dict.

    Returns:
        A CategoryEncoding layer adapted to the feature's index space.
    """
    encoder = CategoryEncoding(output_mode="binary")
    # Keep only the target feature, one element per example.
    feature_only = dataset.map(lambda features, _: features[name])
    # Add a trailing axis so each element has shape (1,).
    feature_only = feature_only.map(lambda value: tf.expand_dims(value, -1))
    # Learn the set of possible integer indices from the data.
    encoder.adapt(feature_only)
    return encoder
# Encode the two integer categorical features. Note output_mode="binary"
# produces a multi-hot vector over the learned index space. Keras layers
# are invoked by calling them directly — Layer.apply is a deprecated
# alias for __call__.
encoder1 = create_integer_categorical_encoder(ds, 'categorical_integer_1')
encoder1(xdf['categorical_integer_1'].values)
encoder2 = create_integer_categorical_encoder(ds, 'categorical_integer_2')
encoder2(xdf['categorical_integer_2'].values)
Preprocessing helper function to encode string categorical features, e.g. LOW, HIGH, MEDIUM.
This applies the following to the input feature:
- Create a token to index lookup table
- Apply one-hot encoding to the tokens indices
def create_string_categorical_encoder(dataset, name):
    """Build an index-then-encode pipeline for the string feature `name`.

    Args:
        dataset: tf.data.Dataset yielding (feature-dict, label) pairs.
        name: key of the string categorical feature inside the feature dict.

    Returns:
        A (StringLookup, CategoryEncoding) pair: the lookup turns string
        tokens into integer indices, the encoder one-hot encodes those
        indices.
    """
    # Lookup table: string token -> integer index.
    lookup = StringLookup()
    # Keep only the target feature, with a trailing axis of size 1.
    feature_only = dataset.map(lambda features, _: features[name])
    feature_only = feature_only.map(lambda value: tf.expand_dims(value, -1))
    # Learn the vocabulary of string values from the data.
    lookup.adapt(feature_only)
    # Encoder over the integer indices produced by the lookup.
    onehot = CategoryEncoding(output_mode="binary")
    # Learn the space of possible indices from the indexed feature.
    onehot.adapt(feature_only.map(lookup))
    return lookup, onehot
# Apply one-hot encoding to a string categorical feature (the original
# comment said "integer", which was wrong for this column). Keras layers
# are invoked by calling them directly — Layer.apply is a deprecated
# alias for __call__.
indexer, encoder3 = create_string_categorical_encoder(ds, 'categorical_string')
# Turn the string values into integer indices via the lookup table.
indices = indexer(xdf['categorical_string'].values)
# One-hot encode the integer indices.
encoder3(indices)
Notice that the string categorical column was one-hot encoded into 5 tokens, whereas the input dataframe contains only 3 unique values. This is because the indexer reserves 2 extra tokens (presumably the mask/padding and out-of-vocabulary entries — confirm against the StringLookup defaults). See the vocabulary:
# Inspect the learned vocabulary; it contains the reserved extra tokens
# in addition to the 3 string values observed in the data.
indexer.get_vocabulary()