Multi GPU and TensorFlow MirroredStrategy

Hello everyone!
First of all thank you for reading this and trying to help.
I have been stuck for the last two weeks trying to decrease the training time of my convolutional neural network by using multiple GPUs. However, for some reason (unknown to me!), the ETA per epoch actually increases when I use multiple GPUs with MirroredStrategy.

The project I am working on is an image segmentation problem with 10 labels, and I have around 15,000 training samples. Below I copy the part of the code where I initialize the GPUs and define my data loader. Any help would be greatly appreciated; I am not sure whether the problem is with the GPU setup or with the data loader.

# Imports used throughout the script:
import os
import random
from glob import glob
from typing import List, Tuple

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import mixed_precision
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import (EarlyStopping, ModelCheckpoint,
                                        ReduceLROnPlateau)

# Define the data_loader class:
class CustomGenerator(keras.utils.Sequence):
    """
    CustomGenerator Data loader/generator

    Custom data loader/generator used to load inputs from disk into RAM and GPU VRAM during training

    Parameters
    ----------
    keras : keras.utils.Sequence
        Inherited keras Sequence class
    """

    def __init__(self,
                 input_paths: List[str],
                 batch_size: int,
                 shuffle: bool = True):
        """
        __init__ Class constructor

        Parameters
        ----------
        input_paths : List[str]
            List of file paths to each input (files should contain a single sample)
        batch_size : int
            Batch size to use when retrieving input
        shuffle : bool, optional
            Option to shuffle input samples, by default True
        """
        self.input_paths = input_paths
        self.batch_size = batch_size
        self.shuffle = shuffle

        if self.shuffle:
            random.shuffle(self.input_paths)

    def on_epoch_end(self):
        # Reshuffle between epochs so each epoch sees different batches
        if self.shuffle:
            random.shuffle(self.input_paths)

    def __len__(self) -> int:
        """
        __len__ Get number of batches based on batch size

        Returns
        -------
        int
            Total number of batches
        """
        return len(self.input_paths) // self.batch_size

    def __getitem__(self, idx: int) -> Tuple[np.ndarray, np.ndarray]:
        """
        __getitem__ Get item

        Returns a batch based on index argument

        Parameters
        ----------
        idx : int
            Index of batch to return

        Returns
        -------
        Tuple[np.ndarray, np.ndarray]
            (Input, label) pair
        """
        batch_paths = self.input_paths[idx * self.batch_size:(idx + 1) *
                                       self.batch_size]

        X = []
        y = []
        for path in batch_paths:
            # Each .npy file holds a single (input, label) pair
            arr = np.load(path, allow_pickle=True)
            X.append(arr[0])
            y.append(arr[1] - 1)  # shift labels from 1-based to 0-based

        X = np.array(X)
        # Passing num_classes explicitly avoids the shape mismatch (and the
        # zero-padding workaround) that occurs when a batch happens not to
        # contain the highest label value
        y = to_categorical(np.array(y), num_classes=9)

        return X, y

# Setting up GPUs
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"
tf.config.optimizer.set_experimental_options({"layout_optimizer": False})

os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"

policy = mixed_precision.Policy("mixed_float16")
mixed_precision.set_global_policy(policy)
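
# Note: with mixed_float16 the model's final softmax should run in float32
# for numeric stability; inside unet_conv3d the output is produced roughly
# like this (illustrative sketch, not the actual function):
#     logits = keras.layers.Conv3D(9, 1, padding="same")(x)
#     outputs = keras.layers.Activation("softmax", dtype="float32")(logits)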

gpus = tf.config.list_physical_devices(device_type="GPU")
for gpu in gpus:
    # Must be set before the GPUs are first used
    tf.config.experimental.set_memory_growth(gpu, True)

print(tf.config.list_physical_devices("GPU"))

#%% Defining GPU strategy
strategy = tf.distribute.MirroredStrategy()
print(strategy.num_replicas_in_sync)
num_filters_base = 4
dropout_rate = 0.2
learning_rate = 0.0001
batch_size = 6
# MirroredStrategy splits each batch across replicas, so the generators
# are built with the global batch size (see below)
global_batch_size = batch_size * strategy.num_replicas_in_sync
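
# Sketch of how the two generator instances are built (the glob pattern and
# the 90/10 split are placeholders, not my actual paths). Because
# MirroredStrategy splits each incoming batch across the replicas, the
# generators are given the global batch size so that each GPU still sees
# batch_size samples:
all_paths = sorted(glob("/path/to/samples/*.npy"))
split = int(0.9 * len(all_paths))
train_dataset = CustomGenerator(all_paths[:split], batch_size=global_batch_size)
val_dataset = CustomGenerator(all_paths[split:],
                              batch_size=global_batch_size,
                              shuffle=False)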


#%% Training the model
with strategy.scope():
    # unet_conv3d is defined elsewhere in my project
    model = unet_conv3d((12, 512, 512, 4),
                        num_filters_base=num_filters_base,
                        dropout_rate=dropout_rate)
    model.compile(
        loss="categorical_crossentropy",
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        metrics=[tf.keras.metrics.Recall()])
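
# Under the mixed_float16 policy, compile wraps the optimizer in a
# LossScaleOptimizer automatically (TF >= 2.4); quick sanity check:
print(type(model.optimizer))  # expect a LossScaleOptimizer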


checkpoint_directory = "/panfs/jay/groups/0/ebtehaj/rahim035/paper_2/V2/Results/Model"

checkpoint_filepath = f"{checkpoint_directory}/script_n2.h5"
callbacks = [
    EarlyStopping(patience=25, verbose=1),
    ReduceLROnPlateau(factor=0.1, patience=10, min_lr=1e-16,
                      verbose=1),
    ModelCheckpoint(filepath=checkpoint_filepath,
                    verbose=1,
                    monitor="val_loss",
                    save_best_only=True,
                    save_weights_only=True)
]

print("Starting fit")

# Note: batch_size is not passed here because the Sequence already
# yields ready-made batches
results = model.fit(train_dataset,
                    epochs=128,
                    callbacks=callbacks,
                    verbose=1,
                    validation_data=val_dataset)
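
# One variable I want to rule out is the input pipeline: the Sequence above
# calls np.load once per sample, serially. For comparison, a rough tf.data
# equivalent with parallel loading and prefetching (sketch; the sample
# shapes below are assumed from the model's input spec, not confirmed):
def load_sample(path):
    arr = np.load(path.decode(), allow_pickle=True)
    x = arr[0].astype(np.float32)
    y = to_categorical(arr[1] - 1, num_classes=9).astype(np.float32)
    return x, y

def make_dataset(paths):
    ds = tf.data.Dataset.from_tensor_slices(paths)
    ds = ds.shuffle(len(paths))
    ds = ds.map(
        lambda p: tf.numpy_function(load_sample, [p], (tf.float32, tf.float32)),
        num_parallel_calls=tf.data.AUTOTUNE)
    # Shapes are lost through numpy_function; restore them so Keras can
    # build the graph (shapes assumed, as noted above)
    ds = ds.map(lambda x, y: (tf.ensure_shape(x, (12, 512, 512, 4)),
                              tf.ensure_shape(y, (12, 512, 512, 9))))
    ds = ds.batch(global_batch_size, drop_remainder=True)
    return ds.prefetch(tf.data.AUTOTUNE)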