I am trying to convert a Keras model.fit() training loop to eager-mode (custom loop) training. The model is an autoencoder with one encoder and two decoders, and the two decoders use different loss functions. The loss functions for the decoders are the same in the eager version and in the model.fit() version, and I tried to set everything else up exactly as in model.fit(), but the resulting loss values are different. I would really appreciate any help.
The Google Colab notebook with the full code: Google Colab
The code below shows the definition and training of the model, which is trained with model.fit(). The training output with the loss values is shown at the end.
def fit_ae(x_unlab, p_m, alpha, parameters):
    # Parameters
    _, dim = x_unlab.shape
    epochs = parameters['epochs']
    batch_size = parameters['batch_size']

    # Build model
    inputs = contrib_layers.Input(shape=(dim,))
    # Encoder
    h = contrib_layers.Dense(int(256), activation='relu', name='encoder1')(inputs)
    h = contrib_layers.Dense(int(128), activation='relu', name='encoder2')(h)
    h = contrib_layers.Dense(int(26), activation='relu', name='encoder3')(h)
    # Mask estimator
    output_1 = contrib_layers.Dense(dim, activation='sigmoid', name='mask')(h)
    # Feature estimator
    output_2 = contrib_layers.Dense(dim, activation='sigmoid', name='feature')(h)
    # Projection network
    model = Model(inputs=inputs, outputs=[output_1, output_2])

    model.compile(optimizer='rmsprop',
                  loss={'mask': 'binary_crossentropy',
                        'feature': 'mean_squared_error'},
                  loss_weights={'mask': 1, 'feature': alpha})

    m_unlab = mask_generator(p_m, x_unlab)
    m_label, x_tilde = pretext_generator(m_unlab, x_unlab)

    # Fit model on unlabeled data
    model.fit(x_tilde, {'mask': m_label, 'feature': x_unlab},
              epochs=epochs, batch_size=batch_size)
########### OUTPUT
Epoch 1/15
4/4 [==============================] - 1s 32ms/step - loss: 1.0894 - mask_loss: 0.6560 - feature_loss: 0.2167
Epoch 2/15
4/4 [==============================] - 0s 23ms/step - loss: 0.6923 - mask_loss: 0.4336 - feature_loss: 0.1293
Epoch 3/15
4/4 [==============================] - 0s 26ms/step - loss: 0.4720 - mask_loss: 0.3022 - feature_loss: 0.0849
Epoch 4/15
4/4 [==============================] - 0s 23ms/step - loss: 0.4054 - mask_loss: 0.2581 - feature_loss: 0.0736
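(For reference, my understanding is that the loss column above is the weighted sum defined by loss_weights, i.e. loss = 1 * mask_loss + alpha * feature_loss; the epoch-1 numbers are consistent with that: 1.0894 ≈ 0.6560 + alpha * 0.2167 for the alpha I pass in.)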
The following code is my eager-mode implementation of the code above. I set the optimizer and the loss functions to be the same as in the first version, and both versions are trained on the same data.
###################################################### MODEL AUTOENCODER ============================================
def eager_ae(x_unlab, p_m, alpha, parameters):
    # import pdb; pdb.set_trace()
    _, dim = x_unlab.shape
    epochs = parameters['epochs']
    batch_size = parameters['batch_size']

    E = keras.Sequential([
        Input(shape=[dim,]),
        Dense(256, activation='relu'),
        Dense(128, activation='relu'),
        Dense(26, activation='relu'),
    ])
    # Mask estimator
    output_1 = keras.Sequential([
        Dense(dim, activation='sigmoid'),
    ])
    # Feature estimator
    output_2 = keras.Sequential([
        Dense(dim, activation='sigmoid'),
    ])

    optimizer = tf.keras.optimizers.RMSprop()
    loss_mask = tf.keras.losses.BinaryCrossentropy()
    loss_feature = tf.keras.losses.MeanSquaredError()

    # Generate corrupted samples
    m_unlab = mask_generator(p_m, x_unlab)
    m_label, x_tilde = pretext_generator(m_unlab, x_unlab)

    for epoch in range(epochs):
        loss_metric = tf.keras.metrics.Mean(name='train_loss')
        len_batch = range(int(x_unlab.shape[0] / batch_size))
        for i in len_batch:
            samples = x_tilde[i * batch_size:(i + 1) * batch_size]
            mask = m_label[i * batch_size:(i + 1) * batch_size]
            # train_step(samples, tgt)
            with tf.GradientTape() as tape:
                latent = E(samples, training=True)
                out_mask = output_1(latent)
                out_feat = output_2(latent)
                # import pdb; pdb.set_trace()
                lm = loss_mask(out_mask, tf.Variable(mask, dtype=tf.float32))
                lf = loss_feature(out_feat, tf.Variable(samples, dtype=tf.float32))
                pred_loss = lm + alpha * lf
            trainable_vars = E.trainable_weights + output_1.trainable_weights + output_2.trainable_weights
            grads = tape.gradient(pred_loss, trainable_vars)
            optimizer.apply_gradients(zip(grads, trainable_vars))
            loss_metric.update_state(pred_loss)
        print(f'Epoch {epoch}, Loss {loss_metric.result()}')
    return E
############# OUTPUT
Epoch 0, Loss 7.902271747589111
Epoch 1, Loss 5.336598873138428
Epoch 2, Loss 2.880791664123535
Epoch 3, Loss 1.9296690225601196
Epoch 4, Loss 1.6377944946289062
Epoch 5, Loss 1.5342860221862793
Epoch 6, Loss 1.5015968084335327
Epoch 7, Loss 1.4912563562393188
The total loss in the first code quickly drops below 1 (down to about 0.4 after four epochs), while the total loss in the second code stays above 1. I cannot find the issue in my second implementation (the eager-mode code).
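For reference, this is how I would compute the loss for a single batch "by hand" to compare against what model.fit reports (only a sketch with a hypothetical helper; it reuses E, output_1, output_2, alpha, x_tilde, m_label, x_unlab and batch_size from the second snippet, and assumes the tf.keras loss objects are called as loss(y_true, y_pred)):

import tensorflow as tf

def manual_batch_loss(E, output_1, output_2, x_tilde, m_label, x_unlab,
                      alpha, batch_size, i=0):
    # Slice batch i the same way the eager loop does
    samples = tf.convert_to_tensor(x_tilde[i * batch_size:(i + 1) * batch_size], dtype=tf.float32)
    mask_true = tf.convert_to_tensor(m_label[i * batch_size:(i + 1) * batch_size], dtype=tf.float32)
    feat_true = tf.convert_to_tensor(x_unlab[i * batch_size:(i + 1) * batch_size], dtype=tf.float32)

    # Forward pass through the shared encoder and the two heads
    latent = E(samples, training=False)
    lm = tf.keras.losses.BinaryCrossentropy()(mask_true, output_1(latent))
    lf = tf.keras.losses.MeanSquaredError()(feat_true, output_2(latent))

    # Same weighted sum as loss_weights={'mask': 1, 'feature': alpha}
    return lm + alpha * lf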