Tensorflow model (with input mel spectrograms) on kaggle runs on CPU instead of GPU

I am using librosa along with tensorflow. More specifically, I am using a generator, where I convert audio .wav files to log-mel spectrogram, delta and delta-delta.
I have a model consisting of CNN layers.
I run the script in kaggle.
However, my model does not use the GPU. It runs on CPU and is extremely slow.
Here is the code, if anyone can help.
class BerGenerator(tf.keras.utils.Sequence):
    """Keras Sequence yielding batches of 3-channel spectrogram "images".

    Each audio file is loaded with librosa and converted into a log-mel
    spectrogram plus its delta and delta-delta; the three 2-D maps are
    stacked on the channel axis and resized to (224, 224, 3) for the CNN.
    """

    def __init__(self, X_train, y_train, batch_size=16,
                 audio_dir='../input/audio-files/audio_files/'):
        """
        Args:
            X_train: array-like of audio file names (indexable by int array).
            y_train: array-like of labels, aligned with X_train.
            batch_size: samples per batch; a trailing partial batch is dropped.
            audio_dir: directory prefix prepended to each file name
                (default preserves the original hard-coded Kaggle path).
        """
        self.batch_size = batch_size
        self.X_train = X_train
        self.y_train = y_train
        self.audio_dir = audio_dir
        self.indexes = np.arange(len(self.X_train))
        self.on_epoch_end()

    def __len__(self):
        # Number of COMPLETE batches per epoch; the remainder is discarded.
        return len(self.X_train) // self.batch_size

    def __getitem__(self, idx):
        """Build and return batch `idx` as (X, y).

        Returns:
            X: float ndarray of shape (batch_size, 224, 224, 3).
            y: ndarray of the corresponding labels.
        """
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        batch_audios = self.X_train[indexes]
        batch_labels = self.y_train[indexes]
        X = np.empty([self.batch_size, 224, 224, 3])

        # BUG FIX: the original reset `i = 0` INSIDE the loop, so every
        # sample overwrote X[0] and the remaining slots kept uninitialized
        # np.empty garbage. enumerate() fills each slot exactly once.
        for i, audio in enumerate(batch_audios):
            path = self.audio_dir + str(audio)
            x, sr = librosa.load(path)
            ps = librosa.feature.melspectrogram(y=x, sr=sr, n_mels=224, hop_length=512)
            ps_db = librosa.power_to_db(ps, ref=np.max)     # log-mel, 2-D [n_mels, t]
            delta = librosa.feature.delta(ps_db)            # first-order delta, 2-D
            delta2 = librosa.feature.delta(ps_db, order=2)  # second-order delta, 2-D
            # Stack the three maps into [n_mels, t, 3] and resize to the
            # fixed CNN input size. NOTE(review): this per-sample librosa +
            # tf work runs on the CPU inside the generator and is very
            # likely the training bottleneck — consider precomputing the
            # features or moving them into a tf.data pipeline.
            ps_db = tf.expand_dims(ps_db, axis=-1)
            delta = tf.expand_dims(delta, axis=-1)
            delta2 = tf.expand_dims(delta2, axis=-1)
            final_map = tf.concat([ps_db, delta, delta2], axis=-1)
            X[i, :, :, :] = tf.image.resize(final_map, [224, 224]).numpy()

        return X, np.array(batch_labels)

    def on_epoch_end(self):
        # NOTE: a fresh RandomState(42) is created on every call, so the
        # SAME permutation is applied each epoch (the cumulative order still
        # changes epoch to epoch). Kept as-is to preserve behavior.
        np.random.RandomState(42).shuffle(self.indexes)

# Extract file names and labels from the dataset (presumably a pandas
# DataFrame loaded earlier — TODO confirm against the full notebook).
X = dataset['audio']
y = dataset['label']
# .tolist() then np.array() forces plain object/label ndarrays so that
# fancy indexing with fold index arrays works below.
X = np.array(X.tolist())
y = np.array(y.tolist())

#skf = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3)
# 10-fold stratified cross-validation (class proportions preserved per fold).
skf = StratifiedKFold(n_splits = 10)

def create_model():
    """Build the binary-classification CNN for (224, 224, 3) feature maps.

    Returns:
        An uncompiled tf.keras Model ending in a single sigmoid unit.
    """
    inp = Input(shape=(224, 224, 3,), dtype="float32")

    # Convolutional feature extractor: two conv stages, each followed by
    # max-pooling and light dropout, then a wide conv + global max pool.
    x = Conv2D(16, kernel_size=(3, 7), activation=activations.relu)(inp)
    x = Conv2D(16, kernel_size=(3, 7), activation=activations.relu)(x)
    x = MaxPool2D(pool_size=(3, 7))(x)
    x = Dropout(rate=0.1)(x)

    x = Conv2D(32, kernel_size=3, activation=activations.relu)(x)
    x = Conv2D(32, kernel_size=3, activation=activations.relu)(x)
    x = MaxPool2D(pool_size=(3, 3))(x)
    x = Dropout(rate=0.1)(x)

    x = Conv2D(128, kernel_size=3, activation=activations.relu)(x)
    x = GlobalMaxPool2D()(x)
    x = Dropout(rate=0.1)(x)

    # Classifier head: two batch-normalized dense layers, sigmoid output.
    head = Dense(128, activation=activations.relu)(x)
    head = BatchNormalization()(head)
    head = Dense(128, activation=activations.relu)(head)
    head = BatchNormalization()(head)
    out = Dense(1, activation='sigmoid')(head)

    return Model(inputs=inp, outputs=out)

# Per-fold metric accumulators. NOTE(review): these lists are never
# appended to in the visible code — either the appends were trimmed from
# this snippet or the aggregation was never wired up; verify.
accuracy = []
precision = []
recall = []
auroc = []
specificity = []
for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
    # Stratified train/test split for this fold, then carve a 20%
    # validation set out of the training portion.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    X_train, X_eval, y_train, y_eval = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
    print("Currently on fold: {}".format(fold))
    
    # Fresh model per fold so folds don't leak weights into each other.
    model = create_model()
    
    # Metric order fixes the meaning of result[i] in the printout below:
    # result = [loss, acc, recall, precision, auc, TN, FP].
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate = 0.001),loss="binary_crossentropy",metrics=["acc",tf.keras.metrics.Recall(), tf.keras.metrics.Precision(),tf.keras.metrics.AUC(),
                                                                                                                tf.keras.metrics.TrueNegatives(),tf.keras.metrics.FalsePositives()])
    
    # Generators do the librosa feature extraction on the fly (CPU-bound).
    # class_weight_function is defined elsewhere in the notebook —
    # presumably it balances the binary classes; TODO confirm.
    train_data = BerGenerator(X_train,y_train,batch_size=2)
    eval_data = BerGenerator(X_eval,y_eval,batch_size = 2)
    history = model.fit(train_data, epochs=1000, verbose = 1, class_weight = class_weight_function(y_train),validation_data=eval_data,callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10),
                                                                                                                                                  tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', mode = 'min',
                                                                                                                                                  factor=0.1, patience=3)])
    test_data = BerGenerator(X_test,y_test,batch_size=4)
    result = model.evaluate(test_data, verbose=0)
    # acc, recall, precision, AUC, and specificity = TN / (TN + FP).
    print(result[1], result[2],result[3], result[4], result[5]/(result[5]+result[6]))

I get the following log messages (they are INFO lines, and the later ones show the CUDA libraries loading successfully):
2021-10-04 20:32:47.111506: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2021-10-04 20:32:47.115770: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2000144999 Hz

2021-10-04 20:32:48.883555: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11
2021-10-04 20:32:49.779140: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.11
2021-10-04 20:32:53.569330: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.8

Thank you.

Hi Loukas,

When I did audio classification, I kept all the preprocessing (librosa and related transforms) in preprocessing layers rather than in the Python generator; restructuring your pipeline that way might fix the GPU utilization — see the tutorial "Transfer learning with YAMNet for environmental sound classification".