The next example was the Movie Review Sentiment Analysis. It did terribly with test data, in particular with 5 unseen, short reviews. I modified it by adding a bidirectional LSTM layer, added early stopping, and annoyingly had to bias my results by greater than 0.69 for “Good.” I also added code to use the locally buffered download data, instead of reloading from the site on every run (continued in reply to myself):
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf
import pandas as pd
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(
monitor='val_loss', # Monitor validation loss
patience=3, # Stop after 3 epochs of no improvement
restore_best_weights=True # Restore the best model's weights after stopping
)
print(tf.__version__)
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset_name = "aclImdb_v1"
dataset_dir_name = "aclImdb"
dataset_path = os.path.join(".", dataset_dir_name)
# Check if the dataset is already downloaded and extracted
if not os.path.exists(dataset_path):
print("Dataset not found locally, downloading...")
dataset = tf.keras.utils.get_file(dataset_name, url, untar=True, cache_dir='.', cache_subdir='')
else:
print("Dataset found locally, skipping download.")
dataset = os.path.join(".", dataset_dir_name)
dataset_dir = os.path.join(os.path.dirname(dataset), dataset_dir_name)
train_dir = os.path.join(dataset_dir, 'train')
# Remove the unsupervised data directory if it exists
remove_dir = os.path.join(train_dir, 'unsup')
if os.path.exists(remove_dir):
shutil.rmtree(remove_dir)
print(f"Dataset is ready at: {dataset_dir}")
batch_size = 32
seed = 42
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
'aclImdb/train',
batch_size=batch_size,
validation_split=0.2,
subset='training',
seed=seed)
raw_val_ds = tf.keras.utils.text_dataset_from_directory(
'aclImdb/train',
batch_size=batch_size,
validation_split=0.2,
subset='validation',
seed=seed)
raw_test_ds = tf.keras.utils.text_dataset_from_directory(
'aclImdb/test',
batch_size=batch_size)
def custom_standardization(input_data):
lowercase = tf.strings.lower(input_data)
stripped_html = tf.strings.regex_replace(lowercase, '
', ' ')
return tf.strings.regex_replace(stripped_html,
'[%s]' % re.escape(string.punctuation),
'')
max_features = 10000
sequence_length = 250
vectorize_layer = layers.TextVectorization(
standardize=custom_standardization,
max_tokens=max_features,
output_mode='int',
output_sequence_length=sequence_length)
Sentement Analysis continued (1 of 2):
def vectorize_text(text, label):
text = tf.expand_dims(text, -1)
return vectorize_layer(text), label
# Make a text-only dataset (without labels), then call adapt
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)
# retrieve a batch (of 32 reviews and labels) from the dataset
# text_batch, label_batch = next(iter(raw_train_ds))
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)
AUTOTUNE = tf.data.AUTOTUNE
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)
embedding_dim = 16
model = tf.keras.Sequential([
layers.Embedding(max_features, embedding_dim),
layers.Bidirectional(layers.LSTM(units=64, return_sequences=True)),
layers.GlobalMaxPooling1D(),
layers.Dropout(0.5),
layers.Dense(64, activation='relu'),
layers.Dropout(0.5),
layers.Dense(1, activation='sigmoid')
])
model.summary()
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
loss=losses.BinaryCrossentropy(),
metrics=[tf.metrics.BinaryAccuracy(threshold=0.5)])
epochs = 30
history = model.fit(
train_ds,
validation_data=val_ds,
epochs=epochs,
callbacks=[early_stopping] # Add the early stopping callback
)
loss, accuracy = model.evaluate(test_ds)
print("Loss: ", loss)
print("Accuracy: ", accuracy)
history_dict = history.history
history_dict.keys()
acc = history_dict['binary_accuracy']
val_acc = history_dict['val_binary_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']
epochs = range(1, len(acc) + 1)
# "bo" is for "blue dot"
plt.plot(epochs, loss, 'bo', label='Training loss')
# b is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.show()
Sentiment Analysis (2 of 2):
# Now we create a binary prediction using sigmoid activation
export_model = tf.keras.Sequential([
vectorize_layer,
model,
layers.Activation('sigmoid')
])
export_model.compile(
loss=losses.BinaryCrossentropy(from_logits=False), optimizer="adam", metrics=['accuracy']
)
# Test it with `raw_test_ds`, which yields raw strings
loss, accuracy, *all_else = export_model.evaluate(raw_test_ds)
print(accuracy)
examples = tf.constant([
"The movie was great! I really loved it. One of the best movies I ever saw.",
"The movie was okay. I have seen better, but I enjoyed it.",
"The movie was terrible. Hated it. Never have I seen such a bad film.",
"I liked the movie. It was entertaining, and I enjoyed it a lot.",
"The movie wasn't all that hot. I would not see it again. I give it a thumbs down."
])
results = export_model.predict(examples)
print(results)
display_results = [(example.numpy().decode("utf-8"), "Good" if value[0] > 0.69 else "Bad")
for example, value in zip(examples, results)]
# Convert the list to a pandas DataFrame
df = pd.DataFrame(display_results, columns=["Review", "Sentiment"])
# Print the table
print(df)
1 Like