try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
except Exception:
    pass
%%capture
%%bash
pip install -U tensorflow-text
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow_text as text
import tensorflow_hub as hub
import tensorflow_datasets as tfds
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Model
Set default options for modules
pd.set_option('display.max_colwidth', None)  # -1 is deprecated in newer pandas; None means no truncation
num_gpus_available = len(tf.config.experimental.list_physical_devices('GPU'))
print("Num GPUs Available: ", num_gpus_available)
assert num_gpus_available > 0
config = {
    'seed': 31,
    'batch_size': 64,
    'epochs': 10,
    'max_seq_len': 128
}
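Note that the seed in config is not applied anywhere below. If you want shuffling and weight initialization to be reproducible, one option (an optional addition, not part of the original flow) is to set it explicitly:
# Optional: apply the configured seed for more reproducible runs.
tf.random.set_seed(config['seed'])
np.random.seed(config['seed'])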
Download the pretrained BERT model
BERT_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
bert_layer = hub.KerasLayer(BERT_URL, trainable=False)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
print(f'BERT vocab is stored at: {vocab_file}')
print(f'BERT model lowercases its input (uncased): {do_lower_case}')
Load the vocab file that corresponds to the pretrained BERT
def load_vocab(vocab_file):
    """Load a vocabulary file into a list."""
    vocab = []
    with tf.io.gfile.GFile(vocab_file, "r") as reader:
        while True:
            token = reader.readline()
            if not token:
                break
            vocab.append(token.strip())
    return vocab
vocab = load_vocab(vocab_file)
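As a quick sanity check (not part of the original flow), you can inspect how many WordPiece entries were loaded; for this checkpoint it should be on the order of 30k:
print(f'Loaded {len(vocab)} tokens, e.g. {vocab[:5]}')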
Use the BERT vocab to create a word-to-index lookup table
def create_vocab_table(vocab, num_oov=1):
    """Create a lookup table for a vocabulary."""
    vocab_values = tf.range(tf.size(vocab, out_type=tf.int64), dtype=tf.int64)
    init = tf.lookup.KeyValueTensorInitializer(
        keys=vocab, values=vocab_values, key_dtype=tf.string, value_dtype=tf.int64)
    vocab_table = tf.lookup.StaticVocabularyTable(init, num_oov, lookup_key_dtype=tf.string)
    return vocab_table
vocab_lookup_table = create_vocab_table(vocab)
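To see what num_oov=1 buys us (illustrative only): in-vocabulary tokens map to their row index, while anything unknown hashes into the single out-of-vocabulary bucket, whose id equals the vocabulary size:
print(vocab_lookup_table.lookup(tf.constant(['movie'])).numpy())              # an in-vocabulary token maps to its row index
print(vocab_lookup_table.lookup(tf.constant(['notarealwordpiece'])).numpy())  # falls into the OOV bucket (id == len(vocab))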
Use the BERT vocab to create an index-to-word lookup table
def create_index2word(vocab):
    """Create a lookup table from token index to token string."""
    vocab_values = tf.range(tf.size(vocab, out_type=tf.int64), dtype=tf.int64)
    init = tf.lookup.KeyValueTensorInitializer(keys=vocab_values, values=vocab)
    return tf.lookup.StaticHashTable(initializer=init, default_value=tf.constant('unk'), name="index2word")
index2word = create_index2word(vocab)
Check out the indices for the following tokens
vocab_lookup_table.lookup(tf.constant(['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]']))
Check out the token corresponding to an index
index2word.lookup(tf.constant([0], dtype='int64')).numpy()
Create a BERT tokenizer using TF Text
tokenizer = text.BertTokenizer(
    vocab_lookup_table,
    token_out_type=tf.int64,
    lower_case=do_lower_case
)
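A small, illustrative check of the tokenizer (the sample sentence is arbitrary): BertTokenizer returns a RaggedTensor of shape [batch, words, wordpieces], which is why tokenize_text below merges the last two dimensions with merge_dims(1, 2):
sample = tf.constant(['This movie was unbelievably good!'])
sample_tokens = tokenizer.tokenize(sample)
print(sample_tokens.shape)                       # (1, None, None): one review, ragged words and wordpieces
print(sample_tokens.merge_dims(1, 2).to_list())  # flat list of wordpiece ids for the review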
Look up the BERT token IDs for padding and the start/end-of-sequence markers.
PAD_ID = vocab_lookup_table.lookup(tf.constant('[PAD]')) # padding token
CLS_ID = vocab_lookup_table.lookup(tf.constant('[CLS]')) # class token
SEP_ID = vocab_lookup_table.lookup(tf.constant('[SEP]')) # sequence separator token
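It is worth printing these once; for the bert_en_uncased vocabulary they are typically 0, 101 and 102 respectively:
print(PAD_ID.numpy(), CLS_ID.numpy(), SEP_ID.numpy())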
Define the logic to preprocess data and format it as required by BERT
def preprocess(record):
    review, label = record['text'], record['label']
    # process the review text into the BERT input features
    ids, mask, type_ids = preprocess_bert_input(review)
    return (ids, mask, type_ids), label

def preprocess_bert_input(review):
    # calculate the token ids
    ids = tokenize_text(review, config['max_seq_len'])
    # calculate the input mask (1 for real tokens, 0 for padding)
    mask = tf.cast(ids > 0, tf.int64)
    mask = tf.reshape(mask, [-1, config['max_seq_len']])
    # calculate the token type ids (all zeros for a single-sentence input)
    zeros_dims = tf.stack(tf.shape(mask))
    type_ids = tf.fill(zeros_dims, 0)
    type_ids = tf.cast(type_ids, tf.int64)
    return (ids, mask, type_ids)
def tokenize_text(review, seq_len):
    # convert text into wordpiece token ids
    tokens = tokenizer.tokenize(review)
    # flatten the ragged [batch, words, wordpieces] output into [batch, tokens]
    tokens = tokens.merge_dims(1, 2)[:, :seq_len]
    # add the [CLS] start and [SEP] end token ids to each sequence
    start_tokens = tf.fill([tf.shape(review)[0], 1], CLS_ID)
    end_tokens = tf.fill([tf.shape(review)[0], 1], SEP_ID)
    tokens = tokens[:, :seq_len - 2]
    tokens = tf.concat([start_tokens, tokens, end_tokens], axis=1)
    # truncate sequences longer than seq_len
    tokens = tokens[:, :seq_len]
    # pad shorter sequences with the pad token id
    tokens = tokens.to_tensor(default_value=PAD_ID)
    pad = seq_len - tf.shape(tokens)[1]
    tokens = tf.pad(tokens, [[0, 0], [0, pad]], constant_values=PAD_ID)
    # finally, reshape the token ids into a dense [batch, seq_len] tensor
    return tf.reshape(tokens, [-1, seq_len])
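A quick shape check on a hand-written review (illustrative only): each of the three features should come back as a [1, max_seq_len] int64 tensor:
sample_ids, sample_mask, sample_type_ids = preprocess_bert_input(tf.constant(['What a great film!']))
print(sample_ids.shape, sample_mask.shape, sample_type_ids.shape)  # each (1, 128) with max_seq_len=128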
Download the dataset from TensorFlow Datasets and process it
train_ds, valid_ds = tfds.load('imdb_reviews', split=['train', 'test'], shuffle_files=True)
train_ds = train_ds.shuffle(1024).batch(config['batch_size']).prefetch(tf.data.experimental.AUTOTUNE)
valid_ds = valid_ds.shuffle(1024).batch(config['batch_size']).prefetch(tf.data.experimental.AUTOTUNE)
train_ds, valid_ds = train_ds.map(preprocess), valid_ds.map(preprocess)
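To confirm the pipeline produces what the model expects (an optional check), pull a single batch and look at the shapes:
(batch_ids, batch_mask, batch_type_ids), batch_labels = next(iter(train_ds))
print(batch_ids.shape, batch_mask.shape, batch_type_ids.shape, batch_labels.shape)  # (64, 128) x 3 and (64,)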
input_ids = Input(shape=(config['max_seq_len'],), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(config['max_seq_len'],), dtype=tf.int32, name="input_mask")
input_type_ids = Input(shape=(config['max_seq_len'],), dtype=tf.int32, name="input_type_ids")
pooled_output, sequence_output = bert_layer([input_ids, input_mask, input_type_ids])
drop_out = Dropout(0.3, name="dropout")(pooled_output)
output = Dense(1, activation='sigmoid', name="output")(drop_out)
model = Model(inputs=[input_ids, input_mask, input_type_ids], outputs=output)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()
model.fit(train_ds, validation_data=valid_ds, epochs=config['epochs'])
test_text_ds = tfds.load('imdb_reviews', split='unsupervised', shuffle_files=True)
# keep the test set unshuffled so predictions line up with the raw text inspected below
test_ds = test_text_ds.batch(config['batch_size']).prefetch(tf.data.experimental.AUTOTUNE)
test_ds = test_ds.map(preprocess)
Check how test text is tokenized
test_text = [record['text'].numpy() for record in test_text_ds.take(10)]
ids = tokenize_text(test_text, config['max_seq_len'])
tokens = [b' '.join(tokens_array) for tokens_array in index2word.lookup(ids).numpy()]
pd.DataFrame({'tokens': tokens})
Run prediction on test reviews
result = model.predict(test_ds)
result.shape
result_df = pd.DataFrame({'label': tf.squeeze(result[:10]).numpy(), 'text': test_text})
result_df.head()
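Since the model emits a single sigmoid probability per review, one way to read the scores (assuming the usual imdb_reviews convention that 1 means a positive review) is to threshold them at 0.5:
result_df['sentiment'] = np.where(result_df['label'] > 0.5, 'positive', 'negative')
result_df.head()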