Multi GPU and TensorFlow MirroredStrategy

Hello everyone!
First of all thank you for reading this and trying to help.
For the last two weeks I have been stuck trying to decrease the training time of my convolutional neural network by using multiple GPUs. However, for some reason unknown to me, the ETA increases when I use multiple GPUs with MirroredStrategy.

The project I am working on is an image segmentation problem with 10 labels, and I have around 15,000 training samples. Below I copy the part of the code where I initialize the GPUs and define my data_loader. Any help would be greatly appreciated. I am not sure whether the problem is with the GPU setup or the data_loader.

# Imports used by the snippets below
import os
import random
from typing import List, Tuple

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import mixed_precision
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical

# Define the data_loader class:
class CustomGenerator(keras.utils.Sequence):
    """
    CustomGenerator Data loader/generator

    Custom data loader/generator used to load inputs from disk into RAM and GPU VRAM during training

    Parameters
    ----------
    keras : keras.utils.Sequence
        Inherited keras Sequence class
    """

    def __init__(self,
                 input_paths: List[str],
                 batch_size: int,
                 shuffle: bool = True):
        """
        __init__ Class constructor

        Parameters
        ----------
        input_paths : List[str]
            List of file paths to each input (files should contain a single sample)
        batch_size : int
            Batch size to use when retrieving input
        shuffle : bool, optional
            Option to shuffle input samples, by default True
        """
        self.input_paths = input_paths
        self.batch_size = batch_size

        if shuffle:
            random.shuffle(self.input_paths)

    def __len__(self) -> int:
        """
        __len__ Get number of batches based on batch size

        Returns
        -------
        int
            Total number of batches
        """
        return len(self.input_paths) // int(self.batch_size)

    def __getitem__(self, idx: int) -> Tuple[np.ndarray, np.ndarray]:
        """
        __getitem__ Get item

        Returns a batch based on index argument

        Parameters
        ----------
        idx : int
            Index of batch to return

        Returns
        -------
        Tuple[np.ndarray, np.ndarray]
            (Input, label) pair
        """
        batch_x = self.input_paths[idx * self.batch_size:(idx + 1) *
                                   self.batch_size]

        X = []
        y = []
        for i in range(self.batch_size):
            arr = np.load(batch_x[i], allow_pickle=True)
            X.append(arr[0])
            y.append(arr[1])
        
        y = [a - 1 for a in y]
        X = np.array(X)
        y = np.array(y)
        y = to_categorical(y)
        if y.shape[-1] != 9:
            y1 = np.zeros([self.batch_size, 8, 512, 512, 9 - y.shape[-1]])
            y = np.concatenate((y, y1), axis=4)

        return X, y

# Setting up GPUs
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"
tf.config.optimizer.set_experimental_options({"layout_optimizer": False})

os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"

policy = mixed_precision.Policy("mixed_float16")
mixed_precision.set_global_policy(policy)

gpus = tf.config.experimental.list_physical_devices(device_type="GPU")
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

print(tf.config.list_physical_devices("GPU"))

#%% Defining GPU strategy
strategy = tf.distribute.MirroredStrategy()
print(strategy.num_replicas_in_sync)
num_filters_base = 8
dropout_rate = 0.2
learning_rate = 0.001
batch_size = 6
global_batch_size = batch_size*strategy.num_replicas_in_sync


#%% Training the model
with strategy.scope():
    model = unet_conv3d((12, 512, 512, 4),
                        num_filters_base=4,
                        dropout_rate=0.2)
    model.compile(
        loss="categorical_crossentropy",
        optimizer=keras.optimizers.Adam(learning_rate=0.0001),
        metrics=[tf.keras.metrics.Recall()])


checkpoint_directory = "/panfs/jay/groups/0/ebtehaj/rahim035/paper_2/V2/Results/Model"

checkpoint_filepath = f"{checkpoint_directory}/script_n2.h5"
callbacks = [
    EarlyStopping(patience=25, verbose=1),
    ReduceLROnPlateau(factor=0.1, patience=10, min_lr=1e-16,
                      verbose=1),
    ModelCheckpoint(filepath=checkpoint_filepath,
                    verbose=1,
                    monitor="val_loss",
                    save_best_only=True,
                    save_weights_only=True)
]

print("Starting fit")

results = model.fit(train_dataset,
                    batch_size=batch_size,
                    epochs=128,
                    callbacks=callbacks,
                    verbose=1,
                    validation_data=val_dataset)

Hello @Reyhane_Rahimi

Thank you for using TensorFlow.
In your code you define global_batch_size = batch_size * strategy.num_replicas_in_sync, but it is not used anywhere, so please confirm that first: MirroredStrategy splits each batch it receives across the replicas, so the data pipeline should produce batches of the global batch size if you want to keep the per-GPU batch size constant. Try taking a smaller subset of the dataset and running the training on a single GPU first to benchmark the hyperparameters, and then switch to the distributed strategy to speed up training. An initial check would be to look at the TensorFlow GPU configuration and confirm that TF can access all the GPUs.
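
Here is a minimal sketch of both checks. It assumes the CustomGenerator class from your post and two placeholder lists of file paths (train_paths, val_paths) that are not shown in your script, so please adapt the names to your code.

import tensorflow as tf

# 1) Confirm TF can actually see every GPU before building the strategy
gpus = tf.config.list_physical_devices("GPU")
print("Visible GPUs:", gpus)

strategy = tf.distribute.MirroredStrategy()
print("Replicas in sync:", strategy.num_replicas_in_sync)

# 2) Batch the data with the GLOBAL batch size: MirroredStrategy splits each
#    batch across the replicas, so keeping 6 samples per GPU means the
#    generator has to produce 6 * num_replicas samples per batch
per_replica_batch_size = 6
global_batch_size = per_replica_batch_size * strategy.num_replicas_in_sync

train_gen = CustomGenerator(train_paths, batch_size=global_batch_size)
val_gen = CustomGenerator(val_paths, batch_size=global_batch_size, shuffle=False)

# 3) For the single-GPU benchmark, OneDeviceStrategy keeps the rest of the
#    script unchanged while pinning the model to one device
single_gpu_strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")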