Hi everyone!
I’m trying to do a binary classification on a very unbalanced dataset.
The model is doing great, but after some random epochs the loss becomes nan, also precision, recall, TP, and FP, all become ZERO.
Sometimes it happens after the 3rd epoch, sometimes after the 20th epoch.
The code:
import numpy as np
import os
from keras import regularizers
from tensorflow.keras import layers
from tensorflow.keras.utils import Sequence
from tensorflow.keras.models import Sequential
import pandas
nodules_csv = pandas.read_csv("/cropped_nodules.csv")
base_dir = "/cropped_nodules/"
all_image_paths = os.listdir(base_dir)
all_image_paths = sorted(all_image_paths,key=lambda x: int(os.path.splitext(x)[0]))
nodules = nodules_csv.rename(columns = {'SN':'ID'})
labels= nodules.iloc[:,1]
labels = labels.to_numpy()
class DataGenerator(Sequence):
# Learned from https://mahmoudyusof.github.io/facial-keypoint-detection/data-generator/
def __init__(self, all_image_paths, labels, base_dir, output_size, shuffle=False, batch_size=10):
"""
Initializes a data generator object
:param csv_file: file in which image names and numeric labels are stored
:param base_dir: the directory in which all images are stored
:param output_size: image output size after preprocessing
:param shuffle: shuffle the data after each epoch
:param batch_size: The size of each batch returned by __getitem__
"""
self.imgs = all_image_paths
self.base_dir = base_dir
self.output_size = output_size
self.shuffle = shuffle
self.batch_size = batch_size
self.labels = labels
self.on_epoch_end()
def on_epoch_end(self):
self.indices = np.arange(len(self.imgs))
if self.shuffle:
np.random.shuffle(self.indices)
def __len__(self):
return int(len(self.imgs) / self.batch_size)
def __getitem__(self, idx):
## Initializing Batch
# that one in the shape is just for a one channel images
# if you want to use colored images you might want to set that to 3
X = np.empty((self.batch_size, *self.output_size,1))
# (x, y, h, w)
y = np.empty((self.batch_size, 1))
# get the indices of the requested batch
indices = self.indices[idx*self.batch_size:(idx+1)*self.batch_size]
for i, data_index in enumerate(indices):
img_path = os.path.join(self.base_dir,
self.imgs[data_index])
img = np.load(img_path)
while img.shape == (31,31,31):
img = np.expand_dims(img, axis=3)
## this is where you preprocess the image
## make sure to resize it to be self.output_size
label = self.labels[data_index]
## if you have any preprocessing for
## the labels too do it here
X[i,] = img
y[i] = label
return X, y
## Defining and training the model
model = Sequential([
## define the model's architecture
layers.Conv3D(filters=32, kernel_size=3, activation="relu",padding='same'),
layers.BatchNormalization(),
layers.Conv3D(filters=32, kernel_size=3, activation="relu",padding='same'),
layers.BatchNormalization(),
layers.MaxPool3D(pool_size=2),
layers.BatchNormalization(),
layers.Conv3D(filters=64, kernel_size=3, activation="relu",padding='same'),
layers.BatchNormalization(),
layers.Conv3D(filters=64, kernel_size=3, activation="relu",padding='same'),
layers.BatchNormalization(),
layers.MaxPool3D(pool_size=2),
layers.BatchNormalization(),
layers.Conv3D(filters=128, kernel_size=3, activation="relu",padding='same'),
layers.BatchNormalization(),
layers.Conv3D(filters=128, kernel_size=3, activation="relu",padding='same'),
layers.BatchNormalization(),
layers.MaxPool3D(pool_size=2),
layers.BatchNormalization(),
layers.Conv3D(filters=256, kernel_size=3, activation="relu", padding='same'),
layers.BatchNormalization(),
layers.Conv3D(filters=256, kernel_size=3, activation="relu", padding='same'),
layers.BatchNormalization(),
layers.MaxPool3D(pool_size=2),
layers.BatchNormalization(),
layers.GlobalAveragePooling3D(),
layers.Dense(units=512, activation="relu"),
layers.BatchNormalization(),
layers.Dropout(0.3),
layers.Dense(units=1, activation="sigmoid"),
])
train_gen = DataGenerator(all_image_paths, labels, base_dir, (31, 31, 31), batch_size=128, shuffle=False)
## compile the model first of course
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy', 'Precision', 'Recall', 'FalseNegatives', 'FalsePositives', 'TrueNegatives', 'TruePositives'])
model.build(input_shape= (128,31,31,31,1))
model.summary()
# now let's train the model
history = model.fit(train_gen, epochs=25)
and the results below:
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv3d (Conv3D) (128, 31, 31, 31, 32) 896
batch_normalization (BatchN (128, 31, 31, 31, 32) 128
ormalization)
conv3d_1 (Conv3D) (128, 31, 31, 31, 32) 27680
batch_normalization_1 (Batc (128, 31, 31, 31, 32) 128
hNormalization)
max_pooling3d (MaxPooling3D (128, 15, 15, 15, 32) 0
)
batch_normalization_2 (Batc (128, 15, 15, 15, 32) 128
hNormalization)
conv3d_2 (Conv3D) (128, 15, 15, 15, 64) 55360
batch_normalization_3 (Batc (128, 15, 15, 15, 64) 256
hNormalization)
conv3d_3 (Conv3D) (128, 15, 15, 15, 64) 110656
batch_normalization_4 (Batc (128, 15, 15, 15, 64) 256
hNormalization)
max_pooling3d_1 (MaxPooling (128, 7, 7, 7, 64) 0
3D)
batch_normalization_5 (Batc (128, 7, 7, 7, 64) 256
hNormalization)
conv3d_4 (Conv3D) (128, 7, 7, 7, 128) 221312
batch_normalization_6 (Batc (128, 7, 7, 7, 128) 512
hNormalization)
conv3d_5 (Conv3D) (128, 7, 7, 7, 128) 442496
batch_normalization_7 (Batc (128, 7, 7, 7, 128) 512
hNormalization)
max_pooling3d_2 (MaxPooling (128, 3, 3, 3, 128) 0
3D)
batch_normalization_8 (Batc (128, 3, 3, 3, 128) 512
hNormalization)
conv3d_6 (Conv3D) (128, 3, 3, 3, 256) 884992
batch_normalization_9 (Batc (128, 3, 3, 3, 256) 1024
hNormalization)
conv3d_7 (Conv3D) (128, 3, 3, 3, 256) 1769728
batch_normalization_10 (Bat (128, 3, 3, 3, 256) 1024
chNormalization)
max_pooling3d_3 (MaxPooling (128, 1, 1, 1, 256) 0
3D)
batch_normalization_11 (Bat (128, 1, 1, 1, 256) 1024
chNormalization)
global_average_pooling3d (G (128, 256) 0
lobalAveragePooling3D)
dense (Dense) (128, 512) 131584
batch_normalization_12 (Bat (128, 512) 2048
chNormalization)
dropout (Dropout) (128, 512) 0
dense_1 (Dense) (128, 1) 513
=================================================================
Total params: 3,653,025
Trainable params: 3,649,121
Non-trainable params: 3,904
_________________________________________________________________
Epoch 1/25
2022-12-15 17:46:04.897341: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:428] Loaded cuDNN version 8401
2022-12-15 17:46:05.829836: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2022-12-15 17:46:06.464508: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:630] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2022-12-15 17:46:07.214021: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x2319ed30 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2022-12-15 17:46:07.214054: I tensorflow/compiler/xla/service/service.cc:181] StreamExecutor device (0): NVIDIA GeForce RTX 3080, Compute Capability 8.6
2022-12-15 17:46:07.217900: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2022-12-15 17:46:07.277629: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2022-12-15 17:46:07.317843: I tensorflow/compiler/jit/xla_compilation_cache.cc:477] Compiled cluster using XLA! This line is logged at most once for the lifetime of the process.
5898/5898 [==============================] - 1184s 199ms/step - loss: 0.0203 - accuracy: 0.9956 - precision: 0.0807 - recall: 0.1113 - false_negatives: 1381.0000 - false_positives: 1972.0000 - true_negatives: 751418.0000 - true_positives: 173.0000
Epoch 2/25
5898/5898 [==============================] - 1178s 200ms/step - loss: 0.0068 - accuracy: 0.9984 - precision: 0.6869 - recall: 0.3779 - false_negatives: 968.0000 - false_positives: 268.0000 - true_negatives: 753120.0000 - true_positives: 588.0000
Epoch 3/25
5898/5898 [==============================] - 1178s 200ms/step - loss: 0.0052 - accuracy: 0.9986 - precision: 0.7472 - recall: 0.4782 - false_negatives: 813.0000 - false_positives: 252.0000 - true_negatives: 753134.0000 - true_positives: 745.0000
Epoch 4/25
5898/5898 [==============================] - 1181s 200ms/step - loss: 0.0045 - accuracy: 0.9987 - precision: 0.7676 - recall: 0.5540 - false_negatives: 694.0000 - false_positives: 261.0000 - true_negatives: 753127.0000 - true_positives: 862.0000
Epoch 5/25
5898/5898 [==============================] - 1181s 200ms/step - loss: 0.0039 - accuracy: 0.9988 - precision: 0.7913 - recall: 0.5963 - false_negatives: 629.0000 - false_positives: 245.0000 - true_negatives: 753141.0000 - true_positives: 929.0000
Epoch 6/25
5898/5898 [==============================] - 1178s 200ms/step - loss: 0.0033 - accuracy: 0.9990 - precision: 0.8080 - recall: 0.6465 - false_negatives: 550.0000 - false_positives: 239.0000 - true_negatives: 753149.0000 - true_positives: 1006.0000
Epoch 7/25
5898/5898 [==============================] - 1178s 200ms/step - loss: 0.0029 - accuracy: 0.9990 - precision: 0.8178 - recall: 0.6913 - false_negatives: 481.0000 - false_positives: 240.0000 - true_negatives: 753146.0000 - true_positives: 1077.0000
Epoch 8/25
5898/5898 [==============================] - 1181s 200ms/step - loss: 0.0024 - accuracy: 0.9992 - precision: 0.8452 - recall: 0.7530 - false_negatives: 385.0000 - false_positives: 215.0000 - true_negatives: 753170.0000 - true_positives: 1174.0000
Epoch 9/25
5898/5898 [==============================] - 1177s 200ms/step - loss: 0.0018 - accuracy: 0.9993 - precision: 0.8632 - recall: 0.8077 - false_negatives: 299.0000 - false_positives: 199.0000 - true_negatives: 753190.0000 - true_positives: 1256.0000
Epoch 10/25
5898/5898 [==============================] - 1180s 200ms/step - loss: 0.0014 - accuracy: 0.9995 - precision: 0.9055 - recall: 0.8508 - false_negatives: 232.0000 - false_positives: 138.0000 - true_negatives: 753251.0000 - true_positives: 1323.0000
Epoch 11/25
5898/5898 [==============================] - 1181s 200ms/step - loss: 0.0014 - accuracy: 0.9995 - precision: 0.9086 - recall: 0.8678 - false_negatives: 206.0000 - false_positives: 136.0000 - true_negatives: 753250.0000 - true_positives: 1352.0000
Epoch 12/25
5898/5898 [==============================] - 1178s 200ms/step - loss: 0.0011 - accuracy: 0.9996 - precision: 0.9207 - recall: 0.8952 - false_negatives: 163.0000 - false_positives: 120.0000 - true_negatives: 753268.0000 - true_positives: 1393.0000
Epoch 13/25
5898/5898 [==============================] - 1182s 200ms/step - loss: 8.5650e-04 - accuracy: 0.9997 - precision: 0.9382 - recall: 0.9177 - false_negatives: 128.0000 - false_positives: 94.0000 - true_negatives: 753294.0000 - true_positives: 1428.0000
Epoch 14/25
5898/5898 [==============================] - 1179s 200ms/step - loss: 7.9298e-04 - accuracy: 0.9998 - precision: 0.9509 - recall: 0.9326 - false_negatives: 105.0000 - false_positives: 75.0000 - true_negatives: 753312.0000 - true_positives: 1452.0000
Epoch 15/25
5898/5898 [==============================] - 1179s 200ms/step - loss: 7.1897e-04 - accuracy: 0.9998 - precision: 0.9576 - recall: 0.9422 - false_negatives: 90.0000 - false_positives: 65.0000 - true_negatives: 753322.0000 - true_positives: 1467.0000
Epoch 16/25
5898/5898 [==============================] - 1181s 200ms/step - loss: 6.0985e-04 - accuracy: 0.9998 - precision: 0.9567 - recall: 0.9499 - false_negatives: 78.0000 - false_positives: 67.0000 - true_negatives: 753320.0000 - true_positives: 1479.0000
Epoch 17/25
5898/5898 [==============================] - 1182s 200ms/step - loss: 6.1805e-04 - accuracy: 0.9998 - precision: 0.9648 - recall: 0.9499 - false_negatives: 78.0000 - false_positives: 54.0000 - true_negatives: 753332.0000 - true_positives: 1480.0000
Epoch 18/25
5898/5898 [==============================] - 1182s 200ms/step - loss: 4.7617e-04 - accuracy: 0.9998 - precision: 0.9657 - recall: 0.9595 - false_negatives: 63.0000 - false_positives: 53.0000 - true_negatives: 753336.0000 - true_positives: 1492.0000
Epoch 19/25
5898/5898 [==============================] - 1196s 203ms/step - loss: 5.4637e-04 - accuracy: 0.9998 - precision: 0.9637 - recall: 0.9563 - false_negatives: 68.0000 - false_positives: 56.0000 - true_negatives: 753332.0000 - true_positives: 1488.0000
Epoch 20/25
5898/5898 [==============================] - 1748s 296ms/step - loss: nan - accuracy: 0.9979 - precision: 0.0000e+00 - recall: 0.0000e+00 - false_negatives: 1557.0000 - false_positives: 0.0000e+00 - true_negatives: 753387.0000 - true_positives: 0.0000e+00
Epoch 21/25
5898/5898 [==============================] - 1150s 195ms/step - loss: nan - accuracy: 0.9979 - precision: 0.0000e+00 - recall: 0.0000e+00 - false_negatives: 1557.0000 - false_positives: 0.0000e+00 - true_negatives: 753387.0000 - true_positives: 0.0000e+00
Epoch 22/25
5898/5898 [==============================] - 1145s 194ms/step - loss: nan - accuracy: 0.9979 - precision: 0.0000e+00 - recall: 0.0000e+00 - false_negatives: 1558.0000 - false_positives: 0.0000e+00 - true_negatives: 753386.0000 - true_positives: 0.0000e+00
Epoch 23/25
5898/5898 [==============================] - 1145s 194ms/step - loss: nan - accuracy: 0.9979 - precision: 0.0000e+00 - recall: 0.0000e+00 - false_negatives: 1555.0000 - false_positives: 0.0000e+00 - true_negatives: 753389.0000 - true_positives: 0.0000e+00
Epoch 24/25
5898/5898 [==============================] - 1146s 194ms/step - loss: nan - accuracy: 0.9979 - precision: 0.0000e+00 - recall: 0.0000e+00 - false_negatives: 1558.0000 - false_positives: 0.0000e+00 - true_negatives: 753386.0000 - true_positives: 0.0000e+00
Epoch 25/25
5898/5898 [==============================] - 1148s 195ms/step - loss: nan - accuracy: 0.9979 - precision: 0.0000e+00 - recall: 0.0000e+00 - false_negatives: 1557.0000 - false_positives: 0.0000e+00 - true_negatives: 753387.0000 - true_positives: 0.0000e+00