Calling compile() twice (whether intentionally or accidentally) causes the losses reported by test_on_batch() and train_on_batch() to diverge.
Below is the source code that reproduces this phenomenon.
import os
os.environ["PYTHONHASHSEED"]=str(1234)
import numpy as np
import unittest
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Activation, Dropout
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.models import load_model
import tensorflow as tf
import random as python_random
np.random.seed(123)
python_random.seed(123)
tf.random.set_seed(1234)
initializer = tf.keras.initializers.GlorotUniform(seed=42)
def get_model():
    model = Sequential()
    model.add(Input(shape=(3,)))
    model.add(BatchNormalization())
    model.add(Dense(10, kernel_initializer=initializer))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dense(4, kernel_initializer=initializer))
    model.compile(optimizer='adam', loss='mean_squared_error')
    model.summary()
    return model
x_data = [
    [0.0, 0.0, 0.0],
    [0.0, 0.0, 1.0],
    [0.0, 1.0, 0.0],
    [0.0, 1.0, 1.0],
    [1.0, 0.0, 0.0],
    [1.0, 0.0, 1.0],
    [1.0, 1.0, 0.0],
    [1.0, 1.0, 1.0],
]
y_data = [
    [0.0, 0.0, 0.0, 1.0],
    [0.0, 1.0, 1.0, 1.0],
    [0.0, 1.0, 1.0, 1.0],
    [0.0, 1.0, 0.0, 0.0],
    [0.0, 1.0, 1.0, 1.0],
    [0.0, 1.0, 0.0, 0.0],
    [0.0, 1.0, 0.0, 0.0],
    [1.0, 1.0, 1.0, 0.0],
]
x_data_keras = np.array(x_data)
y_data_keras = np.array(y_data)
max_loop = 500
# training 1
model = get_model()
for i in range(max_loop):
    result_pre = model.test_on_batch(x_data_keras, y_data_keras)
    result_train = model.train_on_batch(x_data_keras, y_data_keras)
    result_post = model.test_on_batch(x_data_keras, y_data_keras)
    print(result_pre, result_train, result_post)
for i in range(max_loop):
    result_pre = model.test_on_batch(x_data_keras, y_data_keras)
    result_train = model.train_on_batch(x_data_keras, y_data_keras)
    result_post = model.test_on_batch(x_data_keras, y_data_keras)
    print(result_pre, result_train, result_post)
print()
# training 2
model = get_model()
for i in range(max_loop):
    result_pre = model.test_on_batch(x_data_keras, y_data_keras)
    result_train = model.train_on_batch(x_data_keras, y_data_keras)
    result_post = model.test_on_batch(x_data_keras, y_data_keras)
    print(result_pre, result_train, result_post)
model.compile(optimizer='adam', loss='mean_squared_error')  # re-compile
print('re-compile')
for i in range(max_loop):
    result_pre = model.test_on_batch(x_data_keras, y_data_keras)
    result_train = model.train_on_batch(x_data_keras, y_data_keras)
    result_post = model.test_on_batch(x_data_keras, y_data_keras)
    print(result_pre, result_train, result_post)
print()
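As an aside, the same pre/train/post measurement can be wrapped in a small helper that also reports the ratio of the train_on_batch() loss to the following test_on_batch() loss, which makes the dissociation easier to spot numerically. This is only a convenience sketch (the helper name loss_gap is mine, and it is not used in the run whose output is shown below); it assumes the same model, x_data_keras, and y_data_keras as above.
def loss_gap(model, x, y):
    # One measurement step: test loss before, train loss, test loss after.
    pre = model.test_on_batch(x, y)
    train = model.train_on_batch(x, y)
    post = model.test_on_batch(x, y)
    ratio = train / post if post else float('nan')
    return pre, train, post, ratio
# Example (hypothetical) usage: print the train/test ratio every 100 iterations.
# for i in range(max_loop):
#     pre, train, post, ratio = loss_gap(model, x_data_keras, y_data_keras)
#     if i % 100 == 0:
#         print(i, pre, train, post, 'train/test ratio =', round(ratio, 3))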
Part of the output is shown below.
Layer (type) Output Shape Param #
=================================================================
batch_normalization (BatchNo (None, 3) 12
_________________________________________________________________
dense (Dense) (None, 10) 40
_________________________________________________________________
batch_normalization_1 (Batch (None, 10) 40
_________________________________________________________________
activation (Activation) (None, 10) 0
_________________________________________________________________
dense_1 (Dense) (None, 4) 44
=================================================================
Total params: 136
Trainable params: 110
Non-trainable params: 26
_________________________________________________________________
0.6977941393852234 1.1270804405212402 0.6935483813285828
0.6935483813285828 1.1136053800582886 0.6893106698989868
0.6893106698989868 1.1002421379089355 0.6850830316543579
0.6850830316543579 1.0869929790496826 0.6808691620826721
0.6808691620826721 1.0738604068756104 0.6766723990440369
:
:
0.03179752826690674 0.03175930678844452 0.031733617186546326
0.031733617186546326 0.03169688582420349 0.031668905168771744
0.031668905168771744 0.03162558749318123 0.03159947693347931
0.03159947693347931 0.031562428921461105 0.03153043985366821
0.03153043985366821 0.03150099143385887 0.03146739304065704
Model: "sequential_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
batch_normalization_2 (Batch (None, 3) 12
_________________________________________________________________
dense_2 (Dense) (None, 10) 40
_________________________________________________________________
batch_normalization_3 (Batch (None, 10) 40
_________________________________________________________________
activation_1 (Activation) (None, 10) 0
_________________________________________________________________
dense_3 (Dense) (None, 4) 44
=================================================================
Total params: 136
Trainable params: 110
Non-trainable params: 26
_________________________________________________________________
0.6977941393852234 1.1270804405212402 0.6935483813285828
0.6935483813285828 1.1136053800582886 0.6893106698989868
0.6893106698989868 1.1002421379089355 0.6850830316543579
0.6850830316543579 1.0869929790496826 0.6808691620826721
0.6808691620826721 1.0738604068756104 0.6766723990440369
:
:
0.001201115665026009 0.00034435521229170263 0.001191912218928337
0.001191912218928337 0.00033795437775552273 0.0011880495585501194
0.0011880495585501194 0.0003311052278149873 0.0011792024597525597
0.0011792024597525597 0.0003247302083764225 0.0011489215539768338
0.0011489215539768338 0.000317717669531703 0.0011202542809769511
Training 1 calls train_on_batch() 1000 times in total.
The losses at the 1000th iteration are 0.03153043985366821 0.03150099143385887 0.03146739304065704, so test_on_batch() and train_on_batch() report nearly identical losses.
Training 2 calls train_on_batch() 500 times, then compile() again, then train_on_batch() 500 more times.
The losses at the 1000th iteration are 0.0011489215539768338 0.000317717669531703 0.0011202542809769511, which is more than an order of magnitude lower than in training 1, and test_on_batch() and train_on_batch() no longer agree (the gap between them keeps growing).
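To make that concrete, the ratios can be computed directly from the final values printed above (the numbers are copied verbatim from the two runs):
t1_pre, t1_train, t1_post = 0.03153043985366821, 0.03150099143385887, 0.03146739304065704     # last line of training 1
t2_pre, t2_train, t2_post = 0.0011489215539768338, 0.000317717669531703, 0.0011202542809769511  # last line of training 2
print(t1_train / t1_post)  # ~1.001: train and test losses agree in training 1
print(t2_train / t2_post)  # ~0.28: after the re-compile, train_on_batch() reports a ~3.5x smaller loss than test_on_batch()
print(t1_post / t2_post)   # ~28: training 2 also ends at a much lower overall loss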
Do you know what causes this phenomenon?
And if this happens, how can I avoid it?
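For what it is worth, here is one extra check that might help narrow things down; it is only a sketch that reuses model and np from the script above, and the variable names are mine. In my understanding, compile() should not touch the layer weights themselves, so if this prints True, the difference has to come from some other state that compile() resets (for example the optimizer):
w_before = model.get_weights()
model.compile(optimizer='adam', loss='mean_squared_error')  # the second compile, as in training 2
w_after = model.get_weights()
weights_unchanged = all(np.array_equal(a, b) for a, b in zip(w_before, w_after))
print(weights_unchanged)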