TensorFlow 2.17 slow on Apple Silicon when training neural nets

I use a MacBook Pro (M2).
TF 2.15 (installed as tensorflow-macos, with tensorflow-metal) works perfectly fine on Apple Silicon. Recently I upgraded to TF 2.17 and saw a dramatic drop in performance when training neural nets. I have tried several variations (including TF 2.16) without success.
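For reference, a small helper to confirm which of these packages is actually installed (package names as published on PyPI):

from importlib import metadata

# Report which of the relevant packages are installed, and at what version.
for pkg in ("tensorflow", "tensorflow-macos", "tensorflow-metal", "keras"):
    try:
        print("{}=={}".format(pkg, metadata.version(pkg)))
    except metadata.PackageNotFoundError:
        print("{} not installed".format(pkg))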

Note: The typical GPU test (convolutions) works in TF 2.17 (the GPU is found, speedup 10.6x), but training neural nets is extremely slow.
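For reference, a minimal sketch of this kind of convolution test (the tensor shapes and repetition count are chosen arbitrarily here, not taken from the actual benchmark):

import timeit
import tensorflow as tf

def conv_benchmark(device, number=50):
    # Time a batch of 2D convolutions on the given device.
    with tf.device(device):
        x = tf.random.normal([32, 224, 224, 3])
        k = tf.random.normal([3, 3, 3, 64])

        def run():
            y = tf.nn.conv2d(x, k, strides=1, padding="SAME")
            # Reduce to a scalar and fetch it so any asynchronous GPU work
            # has actually finished before the timer stops.
            return tf.reduce_sum(y).numpy()

        run()  # warm-up, so one-time setup is not measured
        return timeit.timeit(run, number=number)

cpu_time = conv_benchmark("/CPU:0")
gpu_time = conv_benchmark("/GPU:0")
print("CPU: {:.2f}s, GPU: {:.2f}s, speedup {:.1f}x".format(
        cpu_time, gpu_time, cpu_time / gpu_time))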

Maybe Keras 3.0, which no longer offers the legacy version of Adam, is causing the decrease in performance.
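A quick probe shows whether the legacy optimizer is still reachable (the exact exception raised depends on the Keras version, hence the broad except):

import tensorflow as tf
from tensorflow import keras

print("TF", tf.__version__, "- Keras", getattr(keras, "__version__", "unknown"))
try:
    opt = keras.optimizers.legacy.Adam()  # available in Keras 2 (TF <= 2.15)
    print("legacy Adam available:", type(opt).__name__)
except Exception as e:
    print("legacy Adam not available:", e)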

Hi @Thomas_Bayer, if possible, could you please share the code for which you are facing the performance issue? Thank you.

Here is the code (generation of Shakespeare text).
Note: In all cases, tensorflow-metal is installed
tensorflow-macos==2.15 is quite fast (less than 3 minutes per epoch)
tensorflow-macos==2.16 is very slow
tensorflow==2.17 is very slow

import numpy as np
import os
import tensorflow as tf
from tensorflow import keras

import timeit

print("Tensorflow v", tf.__version__)
if tf.__version__ >= "2.17.0":
    print("Keras v", tf.keras.__version__)

gpus = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpus))
print(gpus)

if gpus:
    enable = True
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            print(f"setting memory growth for {gpu} to {enable}")
            tf.config.experimental.set_memory_growth(gpu, enable)
        # tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf.config.experimental.VirtualDeviceConfiguration
        # (memory_limit=2048)])
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

shakespeare_url = "https://homl.info/shakespeare"  # shortcut URL
filepath = tf.keras.utils.get_file("shakespeare.txt", shakespeare_url)
with open(filepath) as f:
    shakespeare_text = f.read()

print(shakespeare_text[:148])

np.random.seed(42)
tf.random.set_seed(42)

batch_size = 128
n_steps = 100

tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True, lower=True)
tokenizer.fit_on_texts(shakespeare_text)
config = tokenizer.get_config()

print(tokenizer.texts_to_sequences(["First"]))
print(tokenizer.sequences_to_texts([[20, 6, 9, 8, 3]]))

max_id = len(tokenizer.word_index)  # number of distinct characters
dataset_size = tokenizer.document_count  # total number of characters

print("max_id = {}, dataset_size = {:,.0f}".format(max_id, dataset_size))

[encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1
train_size = dataset_size * 90 // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

window_length = n_steps + 1  # target = input shifted 1 character ahead
dataset = dataset.repeat().window(window_length, shift=1, drop_remainder=True)

dataset = dataset.flat_map(lambda window: window.batch(window_length))
dataset = dataset.shuffle(100_000).batch(batch_size)
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))

dataset = dataset.map(
        lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch)
        )
dataset = dataset.cache().prefetch(tf.data.AUTOTUNE)

if tf.__version__ <= "2.16.0":
    print("Using legacy keras")
    optimizer = keras.optimizers.legacy.Adam()
else:
    print("Using new keras")
    optimizer = keras.optimizers.Adam()

m1 = keras.models.Sequential(
        [
                keras.layers.GRU(
                        128, return_sequences=True, input_shape=[None, max_id],
                        dropout=0, recurrent_dropout=0
                        ),
                keras.layers.TimeDistributed(
                        keras.layers.Dense(
                                max_id,
                                activation="softmax"
                                )
                        )
                ], name="Single_Layer_GRU128"
        )
m1.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer)

LOSS_THRESHOLD = 0.1


class OnThresholdStop(tf.keras.callbacks.Callback):
    def __init__(self, threshold=LOSS_THRESHOLD):
        super().__init__()
        self.threshold = threshold

    def on_train_batch_end(self, batch, logs=None):
        # Stop training as soon as the batch loss drops below the threshold.
        current_loss = (logs or {}).get('loss')
        if current_loss is not None and current_loss < self.threshold:
            print("\n Reached loss = {:.3f}, so stopping training!!".format(current_loss))
            self.model.stop_training = True


early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='loss',
        min_delta=0.005, patience=1,
        mode='auto',
        verbose=True,
        restore_best_weights=True
        )

model = m1
epochs = 2

steps_per_epoch = train_size // batch_size

loss_threshold = OnThresholdStop(0.1)
print("steps per epoch = {:,.0f}".format(steps_per_epoch))

start = timeit.default_timer()
history = model.fit(
        dataset, steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        callbacks=[early_stopping, loss_threshold]
        )
stop = timeit.default_timer()
print('Time: ', stop - start)

TensorFlow GPU training with Keras 3 is slower than with Keras 2.

Try setting the TF_USE_LEGACY_KERAS environment variable.

import os
os.environ['TF_USE_LEGACY_KERAS'] = '1'

Put this near the top of your file, before TensorFlow is imported. That should speed it up significantly.
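Note that on TF 2.16+ the legacy path also needs the separate tf-keras package installed (pip install tf-keras), per the TF 2.16 release notes, and the variable must be set before the tensorflow import, roughly like this:

import os
os.environ['TF_USE_LEGACY_KERAS'] = '1'  # must come before importing tensorflow

import tensorflow as tf
from tensorflow import keras  # with tf-keras installed, this resolves to Keras 2

print("TF", tf.__version__, "- Keras", keras.__version__)  # Keras should report 2.x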