Next in the tutorial - Sentiment Analysis

Kelly_McTiernan · August 24, 2024, 8:48pm

The next example was the Movie Review Sentiment Analysis. It did terribly with test data, in particular with 5 unseen, short reviews. I modified it by adding a bidirectional LSTM layer, added early stopping, and annoyingly had to bias my results by greater than 0.69 for “Good.” I also added code to use the locally buffered download data, instead of reloading from the site on every run (continued in reply to myself):

import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf

import pandas as pd

from tensorflow.keras import layers
from tensorflow.keras import losses

from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(
    monitor='val_loss',       # Monitor validation loss
    patience=3,               # Stop after 3 epochs of no improvement
    restore_best_weights=True # Restore the best model's weights after stopping
)

print(tf.__version__)

url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset_name = "aclImdb_v1"
dataset_dir_name = "aclImdb"
dataset_path = os.path.join(".", dataset_dir_name)

# Check if the dataset is already downloaded and extracted
if not os.path.exists(dataset_path):
    print("Dataset not found locally, downloading...")
    dataset = tf.keras.utils.get_file(dataset_name, url, untar=True, cache_dir='.', cache_subdir='')
else:
    print("Dataset found locally, skipping download.")
    dataset = os.path.join(".", dataset_dir_name)

dataset_dir = os.path.join(os.path.dirname(dataset), dataset_dir_name)
train_dir = os.path.join(dataset_dir, 'train')

# Remove the unsupervised data directory if it exists
remove_dir = os.path.join(train_dir, 'unsup')
if os.path.exists(remove_dir):
    shutil.rmtree(remove_dir)

print(f"Dataset is ready at: {dataset_dir}")

batch_size = 32
seed = 42

raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)

raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed)

raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/test',
    batch_size=batch_size)

def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, '
', ' ')
  return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation),
                                  '')

max_features = 10000
sequence_length = 250

vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

Kelly_McTiernan · August 24, 2024, 9:30pm

Sentement Analysis continued (1 of 2):

def vectorize_text(text, label):
   text = tf.expand_dims(text, -1)
   return vectorize_layer(text), label
    
# Make a text-only dataset (without labels), then call adapt
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

# retrieve a batch (of 32 reviews and labels) from the dataset
# text_batch, label_batch = next(iter(raw_train_ds))

train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

embedding_dim = 16

model = tf.keras.Sequential([
    layers.Embedding(max_features, embedding_dim),
    layers.Bidirectional(layers.LSTM(units=64, return_sequences=True)),
    layers.GlobalMaxPooling1D(),
    layers.Dropout(0.5),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid')
])

model.summary()

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
              loss=losses.BinaryCrossentropy(),
              metrics=[tf.metrics.BinaryAccuracy(threshold=0.5)])

epochs = 30
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
    callbacks=[early_stopping] # Add the early stopping callback
)

loss, accuracy = model.evaluate(test_ds)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

history_dict = history.history
history_dict.keys()

acc = history_dict['binary_accuracy']
val_acc = history_dict['val_binary_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']

epochs = range(1, len(acc) + 1)

# "bo" is for "blue dot"
plt.plot(epochs, loss, 'bo', label='Training loss')
# b is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')

plt.show()

Kelly_McTiernan · August 24, 2024, 9:34pm

Sentiment Analysis (2 of 2):

# Now we create a binary prediction using sigmoid activation
export_model = tf.keras.Sequential([
  vectorize_layer,
  model,
  layers.Activation('sigmoid')
])

export_model.compile(
    loss=losses.BinaryCrossentropy(from_logits=False), optimizer="adam", metrics=['accuracy']
)

# Test it with `raw_test_ds`, which yields raw strings
loss, accuracy, *all_else = export_model.evaluate(raw_test_ds)
print(accuracy)

examples = tf.constant([
    "The movie was great! I really loved it. One of the best movies I ever saw.",
    "The movie was okay. I have seen better, but I enjoyed it.",
    "The movie was terrible. Hated it. Never have I seen such a bad film.",
    "I liked the movie. It was entertaining, and I enjoyed it a lot.",
    "The movie wasn't all that hot. I would not see it again. I give it a thumbs down."
])

results = export_model.predict(examples)
print(results)

display_results = [(example.numpy().decode("utf-8"), "Good" if value[0] > 0.69 else "Bad") 
                   for example, value in zip(examples, results)]

# Convert the list to a pandas DataFrame
df = pd.DataFrame(display_results, columns=["Review", "Sentiment"])

# Print the table
print(df)

Topic		Replies	Views
Text generator modify General Discussion help_request	1	865	January 23, 2024
Getting NaN for loss General Discussion models , datasets , keras , help_request	3	28570	November 9, 2021
Working with multiple errors at once General Discussion help_request	1	759	July 20, 2023
Trying to create a english story generator with BERT model and graph execution error came unwanted Keras models	1	861	December 19, 2024
Model overfitting very fast General Discussion models , nlp , help_request	1	1195	August 21, 2021

Next in the tutorial - Sentiment Analysis

Related topics