In TF 2.3, Keras adds new preprocessing layers for image, text and structured data. The following notebook explores those new layers for dealing with structured data.

For a complete example of how to use the new preprocessing layers for structured data, see the Keras example "Structured data classification from scratch".

Structured data

Generate some random data for playing with and seeing what is the output of the preprocessing layers.

# Toy dataset: four rows mixing a string-categorical column, two
# integer-categorical columns and two numerical columns, plus a binary target.
xdf = pd.DataFrame({
    'categorical_string': ['LOW', 'HIGH', 'HIGH', 'MEDIUM'],
    'categorical_integer_1': [1, 0, 1, 0],
    'categorical_integer_2': [1, 2, 3, 4],
    'numerical_1': [2.3, 0.2, 1.9, 5.8],
    'numerical_2': [16, 32, 8, 60],
})
ydf = pd.DataFrame({'target': [0, 0, 0, 1]})

# Build a tf.data pipeline of (features_dict, label) pairs.
ds = tf.data.Dataset.from_tensor_slices((dict(xdf), ydf))

# Peek at one example to see what the pipeline yields.
for features, label in ds.take(1):
    print('X:', features)
    print('y:', label)
X: {'categorical_string': <tf.Tensor: shape=(), dtype=string, numpy=b'cat1'>, 'categorical_integer_1': <tf.Tensor: shape=(), dtype=int64, numpy=1>, 'categorical_integer_2': <tf.Tensor: shape=(), dtype=int64, numpy=1>, 'numerical_1': <tf.Tensor: shape=(), dtype=float64, numpy=2.3>, 'numerical_2': <tf.Tensor: shape=(), dtype=int64, numpy=16>}
y: tf.Tensor([0], shape=(1,), dtype=int64)
from tensorflow.keras.layers.experimental.preprocessing import Normalization
from tensorflow.keras.layers.experimental.preprocessing import CategoryEncoding
from tensorflow.keras.layers.experimental.preprocessing import StringLookup

Pre-processing Numerical columns

Preprocessing helper function to encode numerical features, e.g. 0.1, 0.2, etc.

def create_numerical_encoder(dataset, name):
    """Build and adapt a ``Normalization`` layer for the feature ``name``.

    Args:
        dataset: a tf.data.Dataset yielding ``(features_dict, label)`` pairs.
        name: key of the numerical feature to normalize.

    Returns:
        A ``Normalization`` layer whose statistics (mean/variance) were
        learned from the dataset via ``adapt``.
    """
    norm_layer = Normalization()

    # Isolate the single feature and add a trailing axis of size 1 —
    # the per-feature shape that adapt() expects.
    single_feature = dataset.map(
        lambda features, _: tf.expand_dims(features[name], -1))

    # Learn the statistics of the data.
    norm_layer.adapt(single_feature)

    return norm_layer
# Apply normalization to a numerical feature.
# BUG FIX: the original called `xdf[name]`, but `name` is undefined at this
# scope (it was only a parameter of the helper) and raises a NameError.
# Reference the column explicitly via a local variable instead.
feature_name = 'numerical_1'
normalizer = create_numerical_encoder(ds, feature_name)
normalizer.apply(xdf[feature_name].values)
<tf.Tensor: shape=(4, 1), dtype=float32, numpy=
array([[-0.7615536],
       [-1.2528784],
       [-0.7615536],
       [-1.2528784]], dtype=float32)>

Pre-processing Integer categorical columns

Preprocessing helper function to encode integer categorical features, e.g. 1, 2, 3

def create_integer_categorical_encoder(dataset, name):
    """Build and adapt a one-hot ``CategoryEncoding`` for integer feature ``name``.

    Args:
        dataset: a tf.data.Dataset yielding ``(features_dict, label)`` pairs.
        name: key of the integer categorical feature to encode.

    Returns:
        A ``CategoryEncoding`` layer (binary/one-hot output mode) adapted to
        the range of indices observed in the dataset.
    """
    one_hot = CategoryEncoding(output_mode="binary")

    # Isolate the single feature and add a trailing axis of size 1,
    # matching the shape adapt() expects.
    single_feature = dataset.map(
        lambda features, _: tf.expand_dims(features[name], -1))

    # Learn the space of possible indices.
    one_hot.adapt(single_feature)

    return one_hot
# One-hot encode the integer categorical feature 'categorical_integer_1'.
col = 'categorical_integer_1'
encoder1 = create_integer_categorical_encoder(ds, col)
encoder1.apply(xdf[col].values)
<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.]], dtype=float32)>
# One-hot encode the integer categorical feature 'categorical_integer_2'.
col = 'categorical_integer_2'
encoder2 = create_integer_categorical_encoder(ds, col)
encoder2.apply(xdf[col].values)
<tf.Tensor: shape=(4, 5), dtype=float32, numpy=
array([[0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.]], dtype=float32)>

Pre-processing String categorical columns

Preprocessing helper function to encode string categorical features, e.g. LOW, HIGH, MEDIUM.

This will apply the following steps to the input feature:

  1. Create a token to index lookup table
  2. Apply one-hot encoding to the tokens indices
def create_string_categorical_encoder(dataset, name):
    """Build an (index, encoder) pair for the string feature ``name``.

    The ``StringLookup`` maps raw string tokens to integer indices; the
    ``CategoryEncoding`` then one-hot encodes those indices.

    Args:
        dataset: a tf.data.Dataset yielding ``(features_dict, label)`` pairs.
        name: key of the string categorical feature to encode.

    Returns:
        A ``(StringLookup, CategoryEncoding)`` tuple, both adapted to the
        dataset. Apply the lookup first, then the encoder, at call time.
    """
    # String -> integer-index lookup table.
    lookup = CategoryEncoding if False else StringLookup()

    # Isolate the single feature with a trailing axis of size 1.
    strings_ds = dataset.map(
        lambda features, _: tf.expand_dims(features[name], -1))

    # Learn the vocabulary of string values and fix their integer indices.
    lookup.adapt(strings_ds)

    # One-hot encoder over the resulting integer indices.
    one_hot = CategoryEncoding(output_mode="binary")

    # Learn the space of possible indices from the indexed dataset.
    one_hot.adapt(strings_ds.map(lookup))

    return lookup, one_hot
# One-hot encode the *string* categorical feature, in two steps.
col = 'categorical_string'
indexer, encoder3 = create_string_categorical_encoder(ds, col)
# Step 1: map the raw strings to integer vocabulary indices.
indices = indexer.apply(xdf[col].values)
# Step 2: one-hot encode those indices.
encoder3.apply(indices)
<tf.Tensor: shape=(4, 5), dtype=float32, numpy=
array([[0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.]], dtype=float32)>

Notice that the string categorical column was one-hot encoded into 5 tokens, whereas the input dataframe has only 3 unique values. This is because the indexer reserves 2 extra tokens — an empty mask token and an out-of-vocabulary token ('[UNK]'). See the vocabulary:

indexer.get_vocabulary()
['', '[UNK]', 'cat2', 'cat3', 'cat1']