I’m running into a performance difference when running the same model with and without a keras.Sequential object defined inside a custom class that inherits from keras.Model.
The following code demonstrates the issue.
Code for the model that uses keras.Sequential:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # must be set before importing TensorFlow to take effect

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.datasets import cifar10
class ConvModel_1(keras.Model):
    def __init__(self, input_shape):
        super().__init__()
        self.input_image_shape = input_shape
        self.mdl = keras.Sequential([
            layers.Input(shape=input_shape),
            layers.Conv2D(32, 3),
            layers.BatchNormalization(),
            layers.MaxPool2D(),
            layers.Conv2D(64, 5),
            layers.BatchNormalization(),
            layers.MaxPool2D(),
            layers.Conv2D(128, 3, kernel_regularizer=keras.regularizers.l2(0.01)),
            layers.BatchNormalization(),
            layers.Flatten(),
            layers.Dense(64, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
            layers.Dropout(0.5),
            layers.Dense(10)
        ])

    def call(self, inputs):
        return self.mdl(inputs)
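(Side note: as a quick sanity check, a dummy batch can be run through this wrapped model to confirm it builds and produces the expected output shape. The snippet below is just for illustration and is not part of my training script.)

# Illustrative sanity check only, not part of the original script:
# a single dummy batch should come out with shape (1, 10).
m1_check = ConvModel_1(input_shape=(32, 32, 3))
dummy = tf.zeros((1, 32, 32, 3))
print(m1_check(dummy).shape)  # -> (1, 10)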
Code for the model that does not use keras.Sequential:
class ConvModel_2(keras.Model):
    def __init__(self, input_shape):
        super().__init__()
        self.input_image_shape = input_shape
        self.inp = layers.Input(shape=input_shape)
        self.c2d_32 = layers.Conv2D(32, 3)
        self.bnrm_1 = layers.BatchNormalization()
        self.mp_1 = layers.MaxPool2D()
        self.c2d_64 = layers.Conv2D(64, 5)
        self.bnrm_2 = layers.BatchNormalization()
        self.mp_2 = layers.MaxPool2D()
        self.c2d_128 = layers.Conv2D(128, 3, kernel_regularizer=keras.regularizers.l2(0.01))
        self.bnrm_3 = layers.BatchNormalization()
        self.flt = layers.Flatten()
        self.dns_64 = layers.Dense(64, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01))
        self.do = layers.Dropout(0.5)
        self.dns_10 = layers.Dense(10)

    def call(self, inputs):
        x = self.c2d_32(inputs)
        x = self.bnrm_1(x)
        x = keras.activations.relu(x)
        x = self.mp_1(x)
        x = self.c2d_64(x)
        x = self.bnrm_2(x)
        x = keras.activations.relu(x)
        x = self.mp_2(x)
        x = self.c2d_128(x)
        x = self.bnrm_3(x)
        x = keras.activations.relu(x)
        x = self.flt(x)
        x = self.dns_64(x)
        x = self.do(x)
        return self.dns_10(x)

    def model(self):
        x = keras.Input(shape=self.input_image_shape)
        return keras.Model(inputs=[x], outputs=self.call(x))
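(For completeness: the model() helper above only exists so I can get a proper per-layer summary, since calling summary() directly on a subclassed model shows much less detail. The snippet below is illustrative only and not part of the training script.)

# Illustrative only: build the subclassed model once with a dummy batch,
# then compare its summary with the functional wrapper returned by model().
m2_check = ConvModel_2(input_shape=(32, 32, 3))
_ = m2_check(tf.zeros((1, 32, 32, 3)))  # builds the variables
m2_check.summary()           # subclassed summary (per-layer output shapes are not resolved)
m2_check.model().summary()   # functional wrapper gives the full layer-by-layer view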
Main:
if __name__ == '__main__':
    print(tf.config.list_physical_devices('GPU'))

    # Load the data
    (X, y), (X_test, y_test) = cifar10.load_data()
    X, X_test = X.astype(np.float32) / 255.0, X_test.astype(np.float32) / 255.0
    w, h, c = X.shape[1], X.shape[2], X.shape[3]
    print(w, h, c)

    # Model with keras.Sequential
    model_1 = ConvModel_1(input_shape=(w, h, c))
    model_1.compile(loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), optimizer=keras.optimizers.Adam(learning_rate=3e-4), metrics=['accuracy'])
    model_1.fit(X, y, batch_size=64, epochs=30)
    model_1.fit(X, y, batch_size=64, epochs=15)
    model_1.evaluate(X_test, y_test, batch_size=64)
    print(model_1.mdl.summary())

    # Model without keras.Sequential
    model_2 = ConvModel_2(input_shape=(w, h, c))
    model_2.compile(loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), optimizer=keras.optimizers.Adam(learning_rate=3e-4), metrics=['accuracy'])
    model_2.fit(X, y, batch_size=64, epochs=30)
    model_2.fit(X, y, batch_size=64, epochs=15)
    model_2.evaluate(X_test, y_test, batch_size=64)
    print(model_2.model().summary())
So after I fit both models for 30 epochs on the CIFAR10 dataset, the first model (i.e., the one where I use the keras.Sequential object to build the model) performs significantly worse than the second model (i.e., the one where I do not use the keras.Sequential object):
For ConvModel_1:
Train:
Epoch 30/30
782/782 [==============================] - 13s 16ms/step - loss: 1.0373 - accuracy: 0.68
---
Test:
157/157 [==============================] - 1s 7ms/step - loss: 1.2568 - accuracy: 0.61
For ConvModel_2:
Train:
Epoch 30/30
782/782 [==============================] - 13s 17ms/step - loss: 0.6688 - accuracy: 0.83
---
Test:
157/157 [==============================] - 2s 8ms/step - loss: 1.0541 - accuracy: 0.72
This doesn’t make sense to me, as the models are identical (besides the input layer, which has no trainable parameters anyway):
ConvModel_1:
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv2d (Conv2D) (None, 30, 30, 32) 896
_________________________________________________________________
batch_normalization (BatchNo (None, 30, 30, 32) 128
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 15, 15, 32) 0
_________________________________________________________________
conv2d_1 (Conv2D) (None, 11, 11, 64) 51264
_________________________________________________________________
batch_normalization_1 (Batch (None, 11, 11, 64) 256
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 5, 5, 64) 0
_________________________________________________________________
conv2d_2 (Conv2D) (None, 3, 3, 128) 73856
_________________________________________________________________
batch_normalization_2 (Batch (None, 3, 3, 128) 512
_________________________________________________________________
flatten (Flatten) (None, 1152) 0
_________________________________________________________________
dense (Dense) (None, 64) 73792
_________________________________________________________________
dropout (Dropout) (None, 64) 0
_________________________________________________________________
dense_1 (Dense) (None, 10) 650
=================================================================
Total params: 201,354
Trainable params: 200,906
Non-trainable params: 448
_________________________________________________________________
ConvModel_2:
Model: "model"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_3 (InputLayer) [(None, 32, 32, 3)] 0
_________________________________________________________________
conv2d_3 (Conv2D) (None, 30, 30, 32) 896
_________________________________________________________________
batch_normalization_3 (Batch (None, 30, 30, 32) 128
_________________________________________________________________
tf.nn.relu (TFOpLambda) (None, 30, 30, 32) 0
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 15, 15, 32) 0
_________________________________________________________________
conv2d_4 (Conv2D) (None, 11, 11, 64) 51264
_________________________________________________________________
batch_normalization_4 (Batch (None, 11, 11, 64) 256
_________________________________________________________________
tf.nn.relu_1 (TFOpLambda) (None, 11, 11, 64) 0
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 5, 5, 64) 0
_________________________________________________________________
conv2d_5 (Conv2D) (None, 3, 3, 128) 73856
_________________________________________________________________
batch_normalization_5 (Batch (None, 3, 3, 128) 512
_________________________________________________________________
tf.nn.relu_2 (TFOpLambda) (None, 3, 3, 128) 0
_________________________________________________________________
flatten_1 (Flatten) (None, 1152) 0
_________________________________________________________________
dense_2 (Dense) (None, 64) 73792
_________________________________________________________________
dropout_1 (Dropout) (None, 64) 0
_________________________________________________________________
dense_3 (Dense) (None, 10) 650
=================================================================
Total params: 201,354
Trainable params: 200,906
Non-trainable params: 448
_________________________________________________________________
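For reference, the matching totals can also be checked programmatically once both models have been fit/built as above; this is just a convenience check, not something from my original script:

# Both calls should report the same total as the summaries above (201,354 params).
print(model_1.count_params())
print(model_2.count_params())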
Just to note: I’m aware that the weights are randomly initialized on every run, but the behavior described above has been consistent over 10 runs, which makes that explanation quite unlikely.
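(A way to rule this out entirely would be to pin the random seeds before building the models; a minimal sketch, not something in the script above:)

# Hypothetical seed pinning, not part of the original script:
# makes weight initialization and shuffling reproducible across runs.
np.random.seed(42)
tf.random.set_seed(42)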
What am I missing here?
Thanks in advance for any help.