Hello!
I want to use cosine annealing learning rate (LR) schedule in my CNN model. When I check the LR in each epoch, I found the LR in each epoch remained unchanged, and the global_step (G_steps variable) remain zero through out the training process. The code snippet of my model is pasted below:
#Create Optimizer
G_steps = tf.Variable(0, name="global_step", trainable=False)
lr_decayed = tf.compat.v1.train.cosine_decay(learning_rate=1e-3, global_step=G_steps, decay_steps=3, alpha=1e-5)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_decayed)
#Fit model
# Keep results for plotting
CurrentLoss = 100*np.ones((1,Epochs),dtype=float)
train_loss_results = []
#train_accuracy_results = []
with tf.device('/device:GPU:2'):
for epoch in range(Epochs):
# epoch_loss_avg = tf.keras.metrics.Mean()
# epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
CurrentLoss[0,epoch] = 0
# Training loop - using batches of 'batch_size'
for Inputs, y in train_loader:
# Optimize the model
loss_value, grads = grad(model, Inputs, y)
optimizer.apply_gradients(zip(grads, model.trainable_variables))
CurrentLoss[0,epoch] = CurrentLoss[0,epoch]+loss_value
# Track progress
##epoch_loss_avg.update_state(loss_value) # Add current batch loss
# Compare predicted label to actual label
# training=True is needed only if there are layers with different
# behavior during training versus inference (e.g. Dropout).
##epoch_accuracy.update_state(y, model(x, training=True))
# End epoch
# train_loss_results.append(epoch_loss_avg.result())
# train_accuracy_results.append(epoch_accuracy.result())
#Save Best Model
if epoch==0:
saveModels(model,Save_Path)
elif CurrentLoss[0,epoch]<min(CurrentLoss[0,0:epoch]):
saveModels(model,Save_Path)
#print("Epoch {:08d}: Loss: {:.10e}".format(epoch,CurrentLoss[0,epoch]))
print("Epoch %d Loss: %.15e"%(epoch,CurrentLoss[0,epoch]))
print("Learning Rate: %.10e"%(optimizer.lr.numpy().item()))
print("Global Step: %d"%(G_steps))
with some used functions defined below:
loss_object = tf.keras.losses.MeanSquaredError()
def loss(model, x, y, training):
# training=training is needed only if there are layers with different
# behavior during training versus inference (e.g. Dropout).
y_ = model(x, training=training)
return loss_object(y_true=y, y_pred=y_)
def grad(model, inputs, targets):
with tf.GradientTape() as tape:
loss_value = loss(model, inputs, targets, training=True)
return loss_value, tape.gradient(loss_value, model.trainable_variables)
Could anyone help to find out the reasons, Thanks!!!