Keras gradio audio classification output error

Cuba_Try · November 8, 2023, 11:38am

I already built a keras model that can will detect the conditions by their audio using microphone, and I want to try the model by using it with gradio interface but unsuccessful.

this is my model architecture:

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from keras.utils import to_categorical

input_shape = (spectrograms.shape[1], 1)

model = Sequential()

model.add(Conv1D(32, 3, padding='valid', strides=1, input_shape=input_shape, activation='relu'))
model.add(MaxPooling1D(pool_size=4, strides=3))
model.add(Conv1D(32, 1, padding='valid', strides=1, activation='relu'))
model.add(MaxPooling1D(pool_size=1, strides=3))
model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='sigmoid'))  # Assuming 2 classes for binary classification encoded
#model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy', 
                       tf.keras.metrics.Precision(), 
                       tf.keras.metrics.Recall(), 
                       tf.keras.metrics.F1Score(), 
                       tf.keras.metrics.TrueNegatives(), 
                       tf.keras.metrics.TruePositives(), 
                       tf.keras.metrics.FalseNegatives(), 
                       tf.keras.metrics.FalsePositives()])

and from this point

import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from keras.utils import to_categorical

# Load audio data and create spectrograms
def predict_conditions(audio):
    audio_data, _ = librosa.load(audio, sr=sample_rate, duration=duration)
    sample_rate=22050, 
    duration=4, 
    hop_length=512, 
    n_mels=128
    
    plt.figure(figsize=(10, 4))
    mel_spectrogram = librosa.feature.melspectrogram(y=audio_data, sr=sample_rate, hop_length=hop_length, n_mels=n_mels)
    librosa.display.specshow(librosa.power_to_db(mel_spectrogram, ref=np.max), y_axis='mel', x_axis='time')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Mel Spectrogram')
    
    # Extract the file name without extension
    file_name = os.path.splitext(os.path.basename(audio_path))[0]
    
    # Create the output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Specify the path to save the spectrogram image
    image_path = os.path.join(output_dir, f'{file_name}_spectrogram.png')

    # Save the spectrogram as an image
    plt.savefig(image_path, bbox_inches='tight', pad_inches=0)
    plt.close()

    output_dir = 'D:/dataset/something/spectrogram'

    return mel_spectrogram, image_path

# Define the directory where your spectrogram images are stored
output_dir = 'D:/dataset/something/spectrogram'

# Create empty lists to store spectrograms 
spectrograms = []

# Function to load spectrogram from image path
def load_spectrogram(image_path, sample_rate=22050, duration=4, hop_length=512, n_mels=128):
  # Load the image 
  image = plt.imread(image_path)

  # Convert the image to grayscale if needed
  if len(image.shape) == 3:
    image = image.mean(axis=-1)

  # Perform inverse of the preprocessing steps in create_spectrogram
  mel_spectrogram = librosa.feature.inverse.mel_to_stft(image)
  audio_data = librosa.feature.inverse.mel_to_audio(mel_spectrogram, sr=sample_rate, hop_length=hop_length)

  return audio_data

# Load spectrograms and labels
for root, dirs, files in os.walk(spectrogram_dir):
  for file in files:
    if file.endswith("_spectrogram.png"):
      image_path = os.path.join(root, file)
      spectrogram = load_spectrogram(image_path)  # Load the spectrogram image
      spectrograms.append(spectrogram)

# Prepare the spectrogram
spectrograms = np.array(spectrograms)

until this point above is my pre-processing steps. And because of that, I thought if I want to test it with gradio, the data needs to be undergoing the similar process as the training. The only difference is that, the training will load the data from directory but for the gradio will take the audio directly from mic. It continues with code below to display the gradio interface. Anyone know what I missed?

def convert_predictions_to_binary(predictions):
    average_prediction = np.mean(predictions)
    threshold = 0.5
    return "okay" if average_prediction > threshold else "not okay"

predictions = model.predict(spectrograms)

# Make binary predictions for each spectrogram and print them to the console.
for prediction in predictions:
    binary_prediction = convert_predictions_to_binary(prediction)
    print(binary_prediction)


test = gr.Interface(fn=predict_conditions, inputs="microphone", outputs = gr.Label(num_top_classes=2),
                    title=title, description=description)

if __name__ == "__main__":
    test.launch()

Sorry for inconvenience occurs, I’m just a newbie here. I greatly appreciate the help