I want to train multiple Keras models concurrently using multiprocessing, and I want to do so using several GPUs on one machine. This is part of a larger algorithm in which each model will be trained with a different optimizer; but for the sake of example, I have put together the following Python code, which roughly illustrates what I'm trying to achieve.
import tensorflow as tf
import multiprocessing as mp
from tensorflow import keras
from tensorflow.keras import layers

# Create a strategy for multi-GPU
strategy = tf.distribute.MirroredStrategy()

# Define function to load and preprocess the data
def get_dataset():
    batch_size = 32
    num_val_samples = 10000
    # Return the Fashion-MNIST dataset in the form of a `tf.data.Dataset`
    (x_train, y_train), (x_test, y_test) = keras.datasets.fashion_mnist.load_data()

    # Preprocess the data (these are Numpy arrays)
    x_train = x_train.reshape(-1, 28, 28, 1).astype("float32") / 255
    x_test = x_test.reshape(-1, 28, 28, 1).astype("float32") / 255
    y_train = y_train.astype("float32")
    y_test = y_test.astype("float32")

    # Reserve num_val_samples samples for validation
    x_val = x_train[-num_val_samples:]
    y_val = y_train[-num_val_samples:]
    x_train = x_train[:-num_val_samples]
    y_train = y_train[:-num_val_samples]

    return (
        tf.data.Dataset.from_tensor_slices((x_train, y_train)).batch(batch_size),
        tf.data.Dataset.from_tensor_slices((x_val, y_val)).batch(batch_size),
        tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(batch_size),
    )

def train_model(num):
    print(num)

    # Open a strategy scope and create/compile the Keras model
    with strategy.scope():
        # AlexNet model
        model = keras.Sequential(
            [
                layers.Conv2D(96, 11, 4, activation='relu', padding='same', input_shape=(28, 28, 1)),
                layers.BatchNormalization(),
                layers.MaxPool2D(3, 2, padding='same'),
                layers.Conv2D(256, 5, 1, activation='relu', padding='same'),
                layers.BatchNormalization(),
                layers.MaxPool2D(3, 2, padding='same'),
                layers.Conv2D(384, 3, 1, activation='relu', padding='same'),
                layers.BatchNormalization(),
                layers.Conv2D(384, 3, 1, activation='relu', padding='same'),
                layers.BatchNormalization(),
                layers.Conv2D(256, 3, 1, activation='relu', padding='same'),
                layers.BatchNormalization(),
                layers.MaxPool2D(3, 2, padding='same'),
                layers.Flatten(),
                layers.Dense(4096, activation='relu'),
                layers.Dropout(0.5),
                layers.Dense(4096, activation='relu'),
                layers.Dropout(0.5),
                layers.Dense(10, activation='softmax'),
            ])
        model.compile(loss=keras.losses.SparseCategoricalCrossentropy(),
                      optimizer=keras.optimizers.Adam(),
                      metrics=['accuracy'])

    # Get the datasets
    train_dataset, val_dataset, test_dataset = get_dataset()

    # Train the model
    model.fit(train_dataset, epochs=10, validation_data=val_dataset, use_multiprocessing=True)

    # Evaluate the model
    test_loss, test_acc = model.evaluate(test_dataset)
    return (num, test_acc)

if __name__ == "__main__":
    pool = mp.Pool()
    fitnesses = pool.map(train_model, [1, 2])
    print(fitnesses)
Currently, the above example hangs after logging the following line twice (once per process, I assume):

F tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:147] Failed setting context: CUDA_ERROR_NOT_INITIALIZED: initialization error
My question is: is the approach I'm following reasonable, i.e. using Python's multiprocessing module together with tf.distribute.MirroredStrategy to train multiple models concurrently across several GPUs on one machine? A lot of this is quite new to me, so it's likely I'm missing something obvious. If the approach is reasonable, what causes the CUDA_ERROR_NOT_INITIALIZED error, and how can I fix it?
Additional details:
- Python version: 3.10.12
- TensorFlow version: 2.12.0
- CUDA version: 11.8
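
For context, what I'm ultimately working toward is the per-optimizer version of train_model sketched below. This is only a rough sketch: build_model is a hypothetical helper that would contain the same Sequential model as above, and the optimizer names are just placeholders rather than the ones the real algorithm will use.

def train_model(optimizer_name):
    # Each process constructs its own optimizer from a name (placeholder choices)
    optimizers = {
        'adam': keras.optimizers.Adam,
        'sgd': keras.optimizers.SGD,
        'rmsprop': keras.optimizers.RMSprop,
    }

    with strategy.scope():
        model = build_model()  # hypothetical helper returning the Sequential model shown above
        model.compile(loss=keras.losses.SparseCategoricalCrossentropy(),
                      optimizer=optimizers[optimizer_name](),
                      metrics=['accuracy'])

    train_dataset, val_dataset, test_dataset = get_dataset()
    model.fit(train_dataset, epochs=10, validation_data=val_dataset)
    test_loss, test_acc = model.evaluate(test_dataset)
    return (optimizer_name, test_acc)

if __name__ == "__main__":
    pool = mp.Pool()
    fitnesses = pool.map(train_model, ['adam', 'sgd', 'rmsprop'])
    print(fitnesses)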