Dear All,
Could anyone help me to fix this error when i try to use 2GPU to train?
I am using tensorflow 2.10 cuda 11.2 python 3.10.4
I checked my computer has 2 GPUs by using this function.
tf.config.list_physical_devices('GPU')
[PhysicalDevice(name=‘/physical_device:GPU:0’, device_type=‘GPU’),
PhysicalDevice(name=‘/physical_device:GPU:1’, device_type=‘GPU’)]
Bellow is my code:
import tensorflow as tf
# Define the distribution strategy
strategy = tf.distribute.MirroredStrategy(devices=["GPU:0", "GPU:1"])
# Define the CNN model
with strategy.scope():
model = tf.keras.models.Sequential([
tf.keras.layers.Conv2D(32, (3,3), activation='relu', input_shape=(128,128,3)),
tf.keras.layers.MaxPooling2D(2,2),
tf.keras.layers.Conv2D(64, (3,3), activation='relu'),
tf.keras.layers.MaxPooling2D(2,2),
tf.keras.layers.Conv2D(128, (3,3), activation='relu'),
tf.keras.layers.MaxPooling2D(2,2),
tf.keras.layers.Flatten(),
tf.keras.layers.Dense(512, activation='relu'),
tf.keras.layers.Dense(1, activation='sigmoid')
])
# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Define the data generators
train_gen = df.flow_from_directory(
"../New folder/dataset/Train",
batch_size=128,
class_mode='binary',
target_size=(128, 128)
)
val_gen = df.flow_from_directory(
"../New folder/dataset/Validation/",
batch_size=128,
class_mode='binary',
target_size=(128, 128)
)
test_gen = df_test.flow_from_directory(
'../New folder/dataset/Test/',
target_size=(128, 128),
batch_size=128,
class_mode='binary'
)
# Train the model with the generators
with strategy.scope():
history = model.fit(train_gen, epochs=10, validation_data=val_gen)
# Evaluate the model on the test data
test_loss, test_acc = model.evaluate(test_gen)
print('Test accuracy:', test_acc)
here is the error message:
Output exceeds the size limit. Open the full output data in a text editor
--------------------------------------------------------------------------- InvalidArgumentError Traceback (most recent call last) Cell In[14], line 45 43 # Train the model with the generators 44 with strategy.scope(): —> 45 history = model.fit(train_gen, epochs=10, validation_data=val_gen) 47 # Evaluate the model on the test data 48 test_loss, test_acc = model.evaluate(test_gen) File [c:\Users\efml\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\utils\traceback_utils.py:70](file:///C:/Users/efml/AppData/Local/Programs/Python/Python310/lib/site-packages/keras/utils/traceback_utils.py:70), in filter_traceback…error_handler(*args, **kwargs) 67 filtered_tb = _process_traceback_frames(e.traceback) 68 # To get the full stack trace, call: 69 # tf.debugging.disable_traceback_filtering()
—> 70 raise e.with_traceback(filtered_tb) from None 71 finally: 72 del filtered_tb File [c:\Users\efml\AppData\Local\Programs\Python\Python310\lib\site-packages\tensorflow\python\eager\execute.py:54](file:///C:/Users/efml/AppData/Local/Programs/Python/Python310/lib/site-packages/tensorflow/python/eager/execute.py:54), in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name) 52 try: 53 ctx.ensure_initialized() —> 54 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name, 55 inputs, attrs, num_outputs) 56 except core._NotOkStatusException as e: 57 if name is not None:
…
Registered devices: [CPU, GPU] Registered kernels: [[Adam/NcclAllReduce]] [Op:__inference_train_function_10553]