CUDA and cuDNN error while training a pix2pix GAN on multiple GPUs

I’m trying to train a pix2pix GAN, but because of the huge amount of data, training on a single GPU takes weeks. However, when I try to use all of my system’s GPUs, I get the following error:

2023-02-11 16:58:10.402282: F tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:636] Check failed: cudnnSetTensorNdDescriptor(handle_.get(), elem_type, nd, dims.data(), strides.data()) == CUDNN_STATUS_SUCCESS (3 vs. 0)batch_descriptor: {count: 0 feature_map_count: 256 spatial: 15 15 15 value_min: 0.000000 value_max: 0.000000 layout: BatchDepthYX}
Aborted (core dumped)

Below is the code that I’m using to train the GAN on multiple GPUs:

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"  
os.environ["CUDA_VISIBLE_DEVICES"] = "1, 2, 3, 4, 5"

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.python.client import device_lib
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import RandomNormal
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
import numpy as np
from numpy import zeros
from numpy import ones
from numpy.random import randint

import skimage.transform as skTrans
import nibabel as nib

# distribute training across the selected GPUs
strategy = tf.distribute.MirroredStrategy(devices=["/GPU:1", "/GPU:2", "/GPU:3", "/GPU:4", "/GPU:5"])
# TF1-style session config capping each GPU at 90% of its memory
gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=0.9)
sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))


with strategy.scope():
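    # SSIM estimated from global image statistics (single mean/variance per volume), not the usual windowed SSIM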
    def ssim(img1, img2):
        mean1 = tf.reduce_mean(img1)
        mean2 = tf.reduce_mean(img2)
        variance1 = tf.reduce_mean(tf.square(img1 - mean1))
        variance2 = tf.reduce_mean(tf.square(img2 - mean2))
        covariance = tf.reduce_mean((img1 - mean1) * (img2 - mean2))
        ssim = (2 * mean1 * mean2 + 0.01) * (2 * covariance + 0.03) / (mean1**2 + mean2**2 + 0.01) / (variance1 + variance2 + 0.03)
        return ssim

    def psnr(img1, img2):
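        # note: tf.math.log is the natural logarithm, so this differs from the usual dB PSNR (log10) by a factor of ln(10)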
        mse = tf.reduce_mean(tf.square(img1 - img2))
        max_value = tf.reduce_max(img1)
        psnr = 20 * tf.math.log(max_value / tf.sqrt(mse))
        return psnr	

    def Combined_loss(y_true, y_pred):
        ssim_score = 1 - ssim(y_true, y_pred)

        # Calculate PSNR
        psnr_score = 1 - ((psnr(y_true, y_pred))/100.0)

        return (ssim_score + psnr_score)/2.0



    # from matplotlib import pyplot

    def generate_real_samples(dataset, n_samples, patch_shape):
        # choose random instances
        ix = randint(0, 1846, n_samples)
        # retrieve selected images
        X1, X2 = dataset[int(ix)] 
        # generate 'real' class labels (1)
        y = ones((n_samples, patch_shape, patch_shape, 1))
        return [X1, X2], y


    def generate_fake_samples(g_model, samples, patch_shape):
        # generate fake instance
        X = g_model.predict(samples)
        # create 'fake' class labels (0)
        y = zeros((len(X), patch_shape, patch_shape, 1))
        return X, y


    def define_discriminator(image_shape):
        # weight initialization
        init = RandomNormal(stddev=0.02)
        # source image input
        in_src_image = layers.Input(shape=image_shape)
        # target image input
        in_target_image = layers.Input(shape=image_shape)
        # concatenate images channel-wise
        merged = layers.Concatenate()([in_src_image, in_target_image])
        # C64
        d = layers.Conv3D(64, (4,4,4), strides=(2,2,2), padding='same', kernel_initializer=init)(merged)
        d = layers.LeakyReLU(alpha=0.2)(d)
        # C128
        d = layers.Conv3D(128, (4,4,4), strides=(2,2,2), padding='same', kernel_initializer=init)(d)
        d = layers.BatchNormalization()(d)
        d = layers.LeakyReLU(alpha=0.2)(d)
        # C256
        d = layers.Conv3D(256, (4,4,4), strides=(2,2,2), padding='same', kernel_initializer=init)(d)
        d = layers.BatchNormalization()(d)
        d = layers.LeakyReLU(alpha=0.2)(d)
        # second last output layer
        d = layers.Conv3D(256, (4,4,4), padding='same', kernel_initializer=init)(d)
        d = layers.BatchNormalization()(d)
        d = layers.LeakyReLU(alpha=0.2)(d)
        # patch output
        d = layers.Conv3D(1, (4,4,4), padding='same', kernel_initializer=init)(d)
        patch_out = layers.Activation('sigmoid')(d)
        # define model
        model = Model([in_src_image, in_target_image], patch_out)
        # compile model
        opt = Adam(learning_rate=0.0002, beta_1=0.9, beta_2=0.999)
        model.compile(loss='binary_crossentropy', optimizer=opt, loss_weights=[0.5])
        return model




    # define an encoder block
    def define_encoder_block(layer_in, n_filters, batchnorm=True):
        # weight initialization
        init = RandomNormal(stddev=0.02)
        # add downsampling layer
        g = layers.Conv3D(n_filters, (4,4,4), strides=(2,2,2), padding='same', kernel_initializer=init)(layer_in)
        # conditionally add batch normalization
        if batchnorm:
            g = layers.BatchNormalization()(g, training=True)
        # leaky relu activation
        g = layers.LeakyReLU(alpha=0.2)(g)
        return g

    # define a decoder block
    def decoder_block(layer_in, skip_in, n_filters, dropout=True):
        # weight initialization
        init = RandomNormal(stddev=0.02)
        # add upsampling layer
        g = layers.Conv3DTranspose(n_filters, (4,4,4), strides=(2,2,2), padding='same', kernel_initializer=init)(layer_in)
        # add batch normalization
        g = layers.BatchNormalization()(g, training=True)
        # conditionally add dropout
        if dropout:
            g = layers.Dropout(0.5)(g, training=True)
        # merge with skip connection
        g = layers.Concatenate()([g, skip_in])
        # relu activation
        g = layers.Activation('relu')(g)
        return g

    # define the standalone generator model
    def define_generator(image_shape):
        # weight initialization
        init = RandomNormal(stddev=0.02)
        # image input
        in_image = layers.Input(shape=image_shape)
        # encoder model
        e1 = define_encoder_block(in_image, 64, batchnorm=False)
        e2 = define_encoder_block(e1, 128)
        e3 = define_encoder_block(e2, 256)
        # bottleneck: no batch norm, followed by relu
        b = layers.Conv3D(256, (4,4,4), strides=(2,2,2), padding='same', kernel_initializer=init)(e3)
        b = layers.Activation('relu')(b)
        # decoder model
        d5 = decoder_block(b, e3, 256, dropout=False)
        d6 = decoder_block(d5, e2, 128, dropout=False)
        d7 = decoder_block(d6, e1, 64, dropout=False)
        # output
        g = layers.Conv3DTranspose(1, (4,4,4), strides=(2,2,2), padding='same', kernel_initializer=init)(d7)
        out_image = layers.Activation('relu')(g)
        # define model
        model = Model(in_image, out_image)
        return model


    def define_gan(g_model, d_model, image_shape):
        # make the discriminator weights (except batch norm layers) not trainable
        for layer in d_model.layers:
            if not isinstance(layer, layers.BatchNormalization):
                layer.trainable = False
        # define the source image
        in_src = layers.Input(shape=image_shape)
        # connect the source image to the generator input
        gen_out = g_model(in_src)
        # connect the source input and generator output to the discriminator input
        dis_out = d_model([in_src, gen_out])
        # src image as input, generated image and classification output
        model = Model(in_src, [dis_out, gen_out])
        # compile model
        opt = Adam(learning_rate=0.0002*3)
        model.compile(loss=['binary_crossentropy', Combined_loss], optimizer=opt, loss_weights=[1,100])
        return model

    # train pix2pix model
    def train(d_model, g_model, gan_model, dataset, n_epochs=300, n_batch=1):
        # determine the output square shape of the discriminator
        n_patch = d_model.output_shape[1]
        # unpack dataset
        #     trainA, trainB = dataset
        # calculate the number of batches per training epoch
        bat_per_epo = 1846  # int(len(trainA) / n_batch)
        # calculate the number of training iterations
        n_steps = bat_per_epo * n_epochs
        print('n_steps: '+str(n_steps))
        # manually enumerate epochs

        for i in range(n_steps):
            # select a batch of real samples
            [X_realA, X_realB], y_real = generate_real_samples(dataset, n_batch, n_patch)
            # generate a batch of fake samples
            X_fakeB, y_fake = generate_fake_samples(g_model, X_realA, n_patch)
            # update discriminator for real samples
            d_loss1 = d_model.train_on_batch([X_realA, X_realB], y_real)
            # update discriminator for generated samples
            d_loss2 = d_model.train_on_batch([X_realA, X_fakeB], y_fake)
            # update the generator
            tg_loss, gen_gan, gen_l1 = gan_model.train_on_batch(X_realA, [y_real, X_realB])
            # summarize performance
            print('>%d, d1[%.3f] d2[%.3f] tot_g[%.3f] gen_gan[%.3f] gen_l1[%.3f]' % (i+1, d_loss1, d_loss2, tg_loss, gen_gan, gen_l1))
            # summarize model performance
            if (i+1) % (bat_per_epo * 1) == 0:
                filename2 = 'model_%06d.h5' % (i+1)
                g_model.save(filename2)
                print('>Saved: %s' % (filename2))



    class CustomSequence(tf.keras.utils.Sequence):  # It inherits from `tf.keras.utils.Sequence` class
        def __init__(self, filenames, batch_size):  # Two input arguments to the class.
            self.filenames= filenames
            self.batch_size = batch_size
            self.root1 = '3d_gan/MC_T1/L0'
            self.root2 = '3d_gan/anat_f'
            self.flist2 = os.listdir('3d_gan/MC_T1/L0')
            self.flist1 = os.listdir('3d_gan/anat_f/')

        def __len__(self):
            return int(np.ceil(len(self.filenames) / float(self.batch_size)))

        def __getitem__(self, idx):  # idx is index that runs from 0 to length of sequence
            impath2 = self.flist2[idx]
            img1 = nib.load(os.path.join(self.root1, impath2)).get_fdata()

            img1 = skTrans.resize(img1, (240,240,240), order=1, preserve_range=True)
            img1 = img1.reshape([1, 240, 240, 240, 1])


            impath1 = self.flist1[idx]
            img2 = nib.load(os.path.join(self.root2, impath1)).get_fdata()

            img2 = skTrans.resize(img2, (240,240,240), order=1, preserve_range=True)
            img2 = (img2 - img2.min())/(img2.max() - img2.min())
            img2 = img2.reshape([1, 240, 240, 240, 1])

            img1 = tf.cast(img1, tf.float32)
            img2 = tf.cast(img2, tf.float32)

            return (img1, img2)



    # build the data sequence and the models inside the strategy scope

    sequence = CustomSequence(filenames = os.listdir('3d_gan/anat_f/'), batch_size = 1)
    d_model = define_discriminator(image_shape= (240, 240, 240, 1))
    g_model = define_generator(image_shape= (240, 240, 240, 1))
    # define the composite model
    gan_model = define_gan(g_model, d_model, image_shape= (240, 240, 240, 1))

    # train model
    train(d_model, g_model, gan_model, sequence)
    sess.close()

Can anyone suggest changes to make my code train on all 5 GPUs?
Thank you.

Hi @Vishnu_Vardhan_Reddy, could you please let us know which versions of TensorFlow and CUDA you are using? Thank you.
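You can print them with something like the snippet below (a minimal check; the build-info keys assume a TF 2.x GPU build, otherwise nvidia-smi or nvcc --version also report the CUDA version):

import tensorflow as tf

print(tf.__version__)  # TensorFlow version
build_info = tf.sysconfig.get_build_info()  # CUDA/cuDNN versions TF was built against
print(build_info.get("cuda_version"), build_info.get("cudnn_version"))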