I’m seeing a difference in model performance when I train the same model with and without a keras.Sequential
object defined inside a custom class that inherits from keras.Model.
The following code demonstrates the issue:
Code for the model which uses keras.Sequential
import os

# Silence TensorFlow's native INFO/WARNING output. The variable has to be in
# the environment before TensorFlow is imported, so it is set ahead of the
# TensorFlow/Keras imports below.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.datasets import cifar10
class ConvModel_1(keras.Model):
    """Convolutional classifier whose layer stack is one ``keras.Sequential``.

    Args:
        input_shape: Shape of a single input image, without the batch axis.
    """

    def __init__(self, input_shape):
        # keras.Model must be initialised before any layer is assigned as an
        # attribute; otherwise Keras raises a RuntimeError on assignment.
        super().__init__()
        self.input_image_shape = input_shape
        # Layer order follows the "sequential" summary printed further down.
        # NOTE(review): that summary lists no activation layers, so unlike
        # ConvModel_2.call no ReLU is applied after the BatchNormalization
        # layers here.
        self.mdl = keras.Sequential([
            # Presumably the stack started with an Input layer - TODO confirm.
            layers.Input(shape=input_shape),
            layers.Conv2D(32, 3),
            layers.BatchNormalization(),
            layers.MaxPool2D(),
            layers.Conv2D(64, 5),
            layers.BatchNormalization(),
            layers.MaxPool2D(),
            layers.Conv2D(128, 3, kernel_regularizer=keras.regularizers.l2(0.01)),
            layers.BatchNormalization(),
            layers.Flatten(),
            layers.Dense(64, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
            # Rate assumed to match ConvModel_2 - TODO confirm.
            layers.Dropout(0.5),
            layers.Dense(10),
        ])

    def call(self, inputs):
        """Run ``inputs`` through the wrapped Sequential stack and return its output."""
        return self.mdl(inputs)
Code for the model which does not use keras.Sequential
class ConvModel_2(keras.Model):
    """Convolutional classifier with each layer held as its own attribute.

    The layers are wired together by hand in ``call``, with an explicit ReLU
    after every BatchNormalization layer.

    Args:
        input_shape: Shape of a single input image, without the batch axis.
    """

    def __init__(self, input_shape):
        # keras.Model must be initialised before any layer is assigned as an
        # attribute; otherwise Keras raises a RuntimeError on assignment.
        super().__init__()
        self.input_image_shape = input_shape
        # Not used by call(); kept so the attribute stays available.
        self.inp = layers.Input(shape=input_shape)

        # First convolution stage.
        self.c2d_32 = layers.Conv2D(32, 3)
        self.bnrm_1 = layers.BatchNormalization()
        self.mp_1 = layers.MaxPool2D()

        # Second convolution stage.
        self.c2d_64 = layers.Conv2D(64, 5)
        self.bnrm_2 = layers.BatchNormalization()
        self.mp_2 = layers.MaxPool2D()

        # Third convolution stage (L2-regularised kernel, no pooling).
        self.c2d_128 = layers.Conv2D(128, 3, kernel_regularizer=keras.regularizers.l2(0.01))
        self.bnrm_3 = layers.BatchNormalization()

        # Dense head.
        self.flt = layers.Flatten()
        self.dns_64 = layers.Dense(64, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01))
        self.do = layers.Dropout(0.5)
        self.dns_10 = layers.Dense(10)

    def call(self, inputs):
        """Apply the three conv stages and the dense head to ``inputs``.

        Returns:
            The output of the final ``Dense(10)`` layer, which has no
            activation.
        """
        x = self.c2d_32(inputs)
        x = self.bnrm_1(x)
        x = keras.activations.relu(x)
        x = self.mp_1(x)

        x = self.c2d_64(x)
        x = self.bnrm_2(x)
        x = keras.activations.relu(x)
        x = self.mp_2(x)

        x = self.c2d_128(x)
        x = self.bnrm_3(x)
        x = keras.activations.relu(x)

        x = self.flt(x)
        x = self.dns_64(x)
        x = self.do(x)
        return self.dns_10(x)

    def model(self):
        """Return a functional ``keras.Model`` that wraps ``call`` on a fresh Input.

        The Input is created with the image shape stored at construction time.
        """
        x = keras.Input(shape=self.input_image_shape)
        return keras.Model(inputs=[x], outputs=self.call(x))
if __name__ == '__main__':
    # Load the data and scale pixel values to [0, 1].
    (X, y), (X_test, y_test) = cifar10.load_data()
    X, X_test = X.astype(np.float32) / 255.0, X_test.astype(np.float32) / 255.0
    w, h, c = X.shape[1], X.shape[2], X.shape[3]
    print(w, h, c)

    # Model with keras.Sequential: 30 epochs, matching the reported results.
    model_1 = ConvModel_1(input_shape=(w, h, c))
    model_1.compile(
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=keras.optimizers.Adam(learning_rate=3e-4),
        metrics=['accuracy'],
    )
    model_1.fit(X, y, batch_size=64, epochs=30)
    model_1.evaluate(X_test, y_test, batch_size=64)

    # Model without keras.Sequential: same loss, optimizer and schedule.
    model_2 = ConvModel_2(input_shape=(w, h, c))
    model_2.compile(
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=keras.optimizers.Adam(learning_rate=3e-4),
        metrics=['accuracy'],
    )
    model_2.fit(X, y, batch_size=64, epochs=30)
    model_2.evaluate(X_test, y_test, batch_size=64)
So after I fit both models for 30 epochs on the CIFAR10 dataset, the first model (i.e., the one where I use the keras.Sequential
object to build the model) performs significantly worse than the second model (i.e., the one where I do not use the keras.Sequential
object to build it):
For ConvModel_1
Epoch 30/30
782/782 [==============================] - 13s 16ms/step - loss: 1.0373 - accuracy: 0.68
157/157 [==============================] - 1s 7ms/step - loss: 1.2568 - accuracy: 0.61
For ConvModel_2
Epoch 30/30
782/782 [==============================] - 13s 17ms/step - loss: 0.6688 - accuracy: 0.83
157/157 [==============================] - 2s 8ms/step - loss: 1.0541 - accuracy: 0.72
This doesn’t make sense to me, as the models are identical (besides the input layers, which have no trainable parameters anyway):
Model: "sequential"
Layer (type) Output Shape Param #
conv2d (Conv2D) (None, 30, 30, 32) 896
batch_normalization (BatchNo (None, 30, 30, 32) 128
max_pooling2d (MaxPooling2D) (None, 15, 15, 32) 0
conv2d_1 (Conv2D) (None, 11, 11, 64) 51264
batch_normalization_1 (Batch (None, 11, 11, 64) 256
max_pooling2d_1 (MaxPooling2 (None, 5, 5, 64) 0
conv2d_2 (Conv2D) (None, 3, 3, 128) 73856
batch_normalization_2 (Batch (None, 3, 3, 128) 512
flatten (Flatten) (None, 1152) 0
dense (Dense) (None, 64) 73792
dropout (Dropout) (None, 64) 0
dense_1 (Dense) (None, 10) 650
Total params: 201,354
Trainable params: 200,906
Non-trainable params: 448
Model: "model"
Layer (type) Output Shape Param #
input_3 (InputLayer) [(None, 32, 32, 3)] 0
conv2d_3 (Conv2D) (None, 30, 30, 32) 896
batch_normalization_3 (Batch (None, 30, 30, 32) 128
tf.nn.relu (TFOpLambda) (None, 30, 30, 32) 0
max_pooling2d_2 (MaxPooling2 (None, 15, 15, 32) 0
conv2d_4 (Conv2D) (None, 11, 11, 64) 51264
batch_normalization_4 (Batch (None, 11, 11, 64) 256
tf.nn.relu_1 (TFOpLambda) (None, 11, 11, 64) 0
max_pooling2d_3 (MaxPooling2 (None, 5, 5, 64) 0
conv2d_5 (Conv2D) (None, 3, 3, 128) 73856
batch_normalization_5 (Batch (None, 3, 3, 128) 512
tf.nn.relu_2 (TFOpLambda) (None, 3, 3, 128) 0
flatten_1 (Flatten) (None, 1152) 0
dense_2 (Dense) (None, 64) 73792
dropout_1 (Dropout) (None, 64) 0
dense_3 (Dense) (None, 10) 650
Total params: 201,354
Trainable params: 200,906
Non-trainable params: 448
Just to note: I’m aware that the weights are randomly initialized on every run, but the behavior described above is consistent across 10 runs, which makes random initialization an unlikely explanation.
What am I missing here?
Thanks in advance for any help.