Hello everyone!
First of all thank you for reading this and trying to help.
I have been stuck for the last two weeks trying to reduce the training time of my convolutional neural network by using multiple GPUs. However, for some reason unknown to me, the ETA shown during training actually increases when I use multiple GPUs with MirroredStrategy.
The project I am working on is an image segmentation problem with 10 labels, and I have around 15,000 training samples. Below I copy the part of the code where I set up the GPUs and define my data loader. Any help would be greatly appreciated; I am not sure whether the problem is with the GPU setup or with the data loader.
# Imports used by the snippet below
import os
import random
from typing import List, Tuple

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import mixed_precision
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical

# Define the data_loader class:
class CustomGenerator(keras.utils.Sequence):
    """
    CustomGenerator Data loader/generator

    Custom data loader/generator used to load inputs from disk into RAM and GPU VRAM during training

    Parameters
    ----------
    keras : keras.utils.Sequence
        Inherited keras Sequence class
    """

    def __init__(self,
                 input_paths: List[str],
                 batch_size: int,
                 shuffle: bool = True):
        """
        __init__ Class constructor

        Parameters
        ----------
        input_paths : List[str]
            List of file paths to each input (files should contain a single sample)
        batch_size : int
            Batch size to use when retrieving input
        shuffle : bool, optional
            Option to shuffle input samples, by default True
        """
        self.input_paths = input_paths
        self.batch_size = batch_size
        if shuffle:
            random.shuffle(self.input_paths)

    def __len__(self) -> int:
        """
        __len__ Get number of batches based on batch size

        Returns
        -------
        int
            Total number of batches
        """
        return len(self.input_paths) // int(self.batch_size)

    def __getitem__(self, idx: int) -> Tuple[np.ndarray, np.ndarray]:
        """
        __getitem__ Get item

        Returns a batch based on index argument

        Parameters
        ----------
        idx : int
            Index of batch to return

        Returns
        -------
        Tuple[np.ndarray, np.ndarray]
            (Input, label) pair
        """
        batch_x = self.input_paths[idx * self.batch_size:(idx + 1) * self.batch_size]
        X = []
        y = []
        for i in range(self.batch_size):
            # Each file holds a single (input, label) sample
            arr = np.load(batch_x[i], allow_pickle=True)
            X.append(arr[0])
            y.append(arr[1])
        # Shift labels down by 1 so they start at 0 before one-hot encoding
        y = [a - 1 for a in y]
        X = np.array(X)
        y = np.array(y)
        y = to_categorical(y)
        # Pad the one-hot labels so every batch ends up with 9 classes on the last axis
        if y.shape[-1] != 9:
            y1 = np.zeros([self.batch_size, 8, 512, 512, 9 - y.shape[-1]])
            y = np.concatenate((y, y1), axis=4)
        return X, y
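
# For reference, the train_dataset and val_dataset that I pass to model.fit
# further down are built from this generator, roughly like this (train_paths and
# val_paths are placeholders standing in for my actual lists of .npy files):
import glob

train_paths = sorted(glob.glob("data/train/*.npy"))  # placeholder pattern
val_paths = sorted(glob.glob("data/val/*.npy"))      # placeholder pattern
train_dataset = CustomGenerator(train_paths, batch_size=6)
val_dataset = CustomGenerator(val_paths, batch_size=6, shuffle=False)

# Quick sanity check on one batch from the generator:
if len(train_dataset) > 0:
    Xb, yb = train_dataset[0]
    print("one batch:", Xb.shape, yb.shape)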
# Setting up GPUs
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"
tf.config.optimizer.set_experimental_options({"layout_optimizer": False})
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"
policy = mixed_precision.Policy("mixed_float16")
mixed_precision.set_global_policy(policy)
gpus = tf.config.experimental.list_physical_devices(device_type="GPU")
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
print(tf.config.list_physical_devices("GPU"))
#%% Defining GPU strategy
strategy = tf.distribute.MirroredStrategy()
print(strategy.num_replicas_in_sync)
num_filters_base = 8
dropout_rate = 0.2
learning_rate = 0.001
batch_size = 6
global_batch_size = batch_size * strategy.num_replicas_in_sync
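# My understanding is that MirroredStrategy splits every global batch across the
# replicas, so each GPU should see global_batch_size / num_replicas_in_sync
# samples per step; per_replica_batch_size below is only a sanity check of that
# arithmetic, nothing else in the script uses it.
per_replica_batch_size = global_batch_size // strategy.num_replicas_in_sync
print("global batch:", global_batch_size, "per replica:", per_replica_batch_size)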
#%% Training the model
with strategy.scope():
    # unet_conv3d is my own model-building function, defined elsewhere in the project
    model = unet_conv3d((12, 512, 512, 4),
                        num_filters_base=4,
                        dropout_rate=0.2)
    model.compile(
        loss="categorical_crossentropy",
        optimizer=keras.optimizers.Adam(learning_rate=0.0001),
        metrics=[tf.keras.metrics.Recall()])

checkpoint_directory = "/panfs/jay/groups/0/ebtehaj/rahim035/paper_2/V2/Results/Model"
checkpoint_filepath = f"{checkpoint_directory}/script_n2.h5"
callbacks = [
    EarlyStopping(patience=25, verbose=1),
    ReduceLROnPlateau(factor=0.1, patience=10, min_lr=1e-16, verbose=1),
    ModelCheckpoint(filepath=checkpoint_filepath,
                    verbose=1,
                    monitor="val_loss",
                    save_best_only=True,
                    save_weights_only=True)
]

print("Starting fit")
results = model.fit(train_dataset,
                    batch_size=batch_size,
                    epochs=128,
                    callbacks=callbacks,
                    verbose=1,
                    validation_data=val_dataset)
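One thing I keep wondering about (but have not benchmarked yet) is whether the Sequence-based loader itself leaves the GPUs waiting, since every batch is read from disk synchronously with np.load inside __getitem__. As a comparison point, below is a minimal tf.data sketch of the same per-file loading step; np_load_sample is a placeholder helper I made up for this example, the path list is the same placeholder as above, and the tensors coming out of tf.numpy_function would still need their shapes set before this could be passed to model.fit.

def np_load_sample(path):
    # Placeholder helper: mirrors what CustomGenerator.__getitem__ does per file
    arr = np.load(path.decode("utf-8"), allow_pickle=True)
    x = arr[0].astype(np.float32)
    y = to_categorical(arr[1] - 1, num_classes=9).astype(np.float32)
    return x, y

def tf_load_sample(path):
    # Wrap the NumPy loader so it can run inside a tf.data pipeline
    x, y = tf.numpy_function(np_load_sample, [path], [tf.float32, tf.float32])
    return x, y

train_tfdata = (tf.data.Dataset.from_tensor_slices(train_paths)  # placeholder list from above
                .shuffle(buffer_size=1024)
                .map(tf_load_sample, num_parallel_calls=tf.data.AUTOTUNE)
                .batch(global_batch_size, drop_remainder=True)
                .prefetch(tf.data.AUTOTUNE))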