Hi,
The following custom training loop fails with the error ‘Allocator (GPU_0_bfc) ran out of memory’ after a couple of epochs.
import os
from datetime import datetime

import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers

# return_scaled_sigmoid, training_dataset and val_dataset are defined elsewhere.
gwd_model = tf.keras.Sequential([layers.Dense(384, activation="relu"),
                                 layers.Dense(16, activation="relu"),
                                 layers.Dense(16, activation="relu"),
                                 layers.Dense(1, activation=return_scaled_sigmoid)])
optimizer = tf.keras.optimizers.AdamW()

@tf.function
def apply_gradients(gradients):
    optimizer.apply_gradients(zip(gradients, gwd_model.trainable_variables))

NEPOCHS = 40
NEPOCHS_MAX = 200
#os.remove('/tf/stop')
loss_function = tf.keras.losses.Huber()
ntraining_batches = len(list(training_dataset))
nval_batches = len(list(val_dataset))
print(f"Number of training batches: {ntraining_batches}, number of validation batches: {nval_batches}")
tf.keras.backend.set_value(optimizer.learning_rate, 1.0e-3)
best_val_loss = float('inf')
patience = 2   # Number of epochs to wait before reducing LR
wait = 0       # Counter for epochs waited
factor = 0.96  # Factor by which to reduce LR

for epoch in range(NEPOCHS_MAX):
    # Ramp up the number of batches used per epoch until all of them are used.
    ntraining_batch = int(epoch * ntraining_batches / NEPOCHS)
    ntraining_batch = max(ntraining_batch, 1)
    ntraining_batch = min(ntraining_batch, ntraining_batches)
    #ntraining_batch = ntraining_batches
    nval_batch = int(epoch * nval_batches / NEPOCHS)
    nval_batch = max(nval_batch, 1)
    nval_batch = min(nval_batch, nval_batches)
    #nval_batch = nval_batches
    print(f"Epoch: {epoch}, Training batches: {ntraining_batch}, Validation batches: {nval_batch}")

    # Training pass.
    loss_mean = tf.keras.metrics.Mean()
    current_time = datetime.now()
    current_time_string = current_time.strftime("%H%M%S-%Y%m%d")
    print("current_time:", current_time_string)
    dataset = training_dataset.take(ntraining_batch)
    for dataset_features, dataset_labels in dataset:
        #print("ibatch", ibatch, " nbatches=", ntraining_batches)
        #loss = train_step(gwd_model, dataset_features, dataset_labels)
        with tf.GradientTape() as tape:
            predictions = gwd_model(dataset_features, training=True)
            loss = loss_function(dataset_labels, predictions)
        gradients = tape.gradient(loss, gwd_model.trainable_variables)
        apply_gradients(gradients)
        #optimizer.apply_gradients(zip(gradients, gwd_model.trainable_variables))
        loss_mean.update_state(loss)
    loss = loss_mean.result()

    # Validation pass.
    val_loss_mean = tf.keras.metrics.Mean()
    current_time = datetime.now()
    current_time_string = current_time.strftime("%H%M%S-%Y%m%d")
    print("current_time:", current_time_string)
    dataset = val_dataset.take(nval_batch)
    for dataset_features, dataset_labels in dataset:
        predictions = gwd_model(dataset_features, training=False)
        val_loss = loss_function(dataset_labels, predictions)
        val_loss_mean.update_state(val_loss)
    val_loss = val_loss_mean.result()

    learning_rate = optimizer.learning_rate
    print(f"Epoch {epoch}: Loss: {loss.numpy():.4e}, Validation Loss: {val_loss.numpy():.4e}, Learning Rate: {learning_rate.numpy():.4e}")

    # Reduce the learning rate when the validation loss has not improved for `patience` epochs.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            learning_rate = learning_rate * factor
            tf.keras.backend.set_value(optimizer.learning_rate, learning_rate)
            print(f"New learning rate: {learning_rate:.4e}.")
            wait = 0

    # Write the weights and biases of each layer to CSV files after every epoch.
    logs = {'loss': loss, 'val_loss': val_loss}
    current_time = datetime.now()
    current_time_string = current_time.strftime("%H%M%S-%Y%m%d")
    layer_units = [str(layer.units) for layer in gwd_model.layers if hasattr(layer, 'units')]
    layer_units_string = 'x'.join(layer_units)
    postfix = f"{layer_units_string}-epoch-{epoch+1}-val_loss-{val_loss:.8f}-{current_time_string}.csv"
    for ilayer, layer in enumerate(gwd_model.layers):
        pd.DataFrame(layer.weights[0]).to_csv(f"/tmp5/gwies/tf/weights{ilayer}-{postfix}", header=False, index=False)
        pd.DataFrame(layer.weights[1]).to_csv(f"/tmp5/gwies/tf/bias{ilayer}-{postfix}", header=False, index=False)

    # Stopping criteria.
    if (epoch > NEPOCHS) and (val_loss < 1.0e-5):
        break
    if learning_rate < 1.0e-6:
        break
    stop = '/tf/stop'
    if os.path.exists(stop):
        print(f"\nStopping training as '{stop}' exists.")
        break
The batch size is 65536, and the loop tries to allocate a tensor of shape [65536, 384], i.e. [batch size, number of units of the first dense layer], of type float, in the context of ReluGrad. Should such a tensor not have been allocated once before training starts, rather than after epoch 14? The allocator also dumps the heap, which contains about 20,000 objects of size 262144 bytes (which corresponds to 65536 floats). The number of batches used per epoch is slowly increased, but if I train the model with gwd_model.fit() all batches are used from epoch 1 and I do not get the error even after 100 epochs.
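For reference, here is a rough back-of-the-envelope calculation of the sizes reported by the allocator, assuming float32 (4 bytes per element):

    batch_size = 65536
    units_first_layer = 384
    bytes_per_float = 4  # assuming float32

    # One ReluGrad tensor of shape [65536, 384]:
    relugrad_bytes = batch_size * units_first_layer * bytes_per_float
    print(f"[65536, 384] float32 tensor: {relugrad_bytes / 2**20:.0f} MiB")  # ~96 MiB

    # One heap object of 262144 bytes corresponds to 65536 float32 values,
    # i.e. one value per example in the batch (e.g. a [65536] or [65536, 1] tensor).
    print(262144 // bytes_per_float)  # 65536

    # ~20,000 such objects add up to roughly 5 GiB:
    print(f"{20_000 * 262144 / 2**30:.1f} GiB")  # ~4.9 GiB

If those numbers are right, the single [65536, 384] buffer that fails to allocate is only about 96 MiB, while the ~20,000 batch-sized objects in the heap dump alone account for roughly 5 GiB.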
Any suggestions as to what could be causing the allocator to run out of memory on the GPU?
Regards,
GW