[Voice Recognition] How can I use the model?

I’m no audio expert, but voice recognition is one of those fields that I really wanted to explore.

I have been following the TensorFlow wiki regarding this matter.

To create a model, I used personal recordings of myself and some of my friends. I noticed that the model could only accept audio samples of about 31.2 KB (WAV format).

After training this model, I successfully saved it along with the classes with the following code:

# Save the model and its class labels.
print("Saving model and label")
# Open the label file for binary writing; the file name embeds test_acc and
# today's date (YYYY-MM-DD).
# NOTE(review): this excerpt stops here -- the write to `f` and the matching
# close are not shown.
f = open("models/labels_"+str(test_acc)+"_"+'{:%Y-%m-%d}'.format(datetime.datetime.now())+".pickle", "wb")

Now, given a 5-minute audio file or a stream of audio, I want to use this model to check whether a given word was said. Any ideas how this could be accomplished?

Currently, I’m a bit clueless about what to do next. The following code can classify a 31.2 KB file, but what I really want is to classify bigger audio files and find a better way to use the model.

import os
import pathlib

import datetime
import pickle
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tensorflow as tf

from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras import layers
from tensorflow.keras import models
from IPython import display

# Report how many GPUs TensorFlow can see.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

def get_waveform_and_label(file_path):
  """Read a WAV file and derive its label from the path.

  Args:
    file_path: Path of the WAV file, as a string tensor.

  Returns:
    A (waveform, label) pair. The waveform is the decoded audio with its
    last axis squeezed away; the label is the second-to-last component of
    file_path, i.e. the name of the folder that contains the file.
  """
  # The label is the folder the file sits in.
  label = tf.strings.split(file_path, os.path.sep)[-2]

  # Decode the raw bytes; the second value returned by decode_wav is unused.
  decoded, _ = tf.audio.decode_wav(tf.io.read_file(file_path))

  # Drop the last axis of the decoded audio (it must have size 1).
  return tf.squeeze(decoded, axis=-1), label

def get_spectrogram(waveform):
  """Convert a 1-D waveform into a fixed-size magnitude spectrogram.

  The waveform is first brought to exactly 16000 samples: longer inputs are
  truncated (samples past the first 16000 are discarded) and shorter inputs
  are zero-padded at the end. Previously a longer input made the padding
  length negative and the call failed.

  Args:
    waveform: 1-D tensor of audio samples.

  Returns:
    The magnitude of the short-time Fourier transform of the fixed-length
    waveform (frame_length=255, frame_step=128).
  """
  input_len = 16000

  # Keep at most input_len samples so the padding length below can never be
  # negative. Inputs that are already short enough are left untouched.
  waveform = waveform[:input_len]

  # Padding for files with less than input_len samples
  zero_padding = tf.zeros([input_len] - tf.shape(waveform), dtype=tf.float32)

  # Concatenate audio with padding so that all audio clips will be of the
  # same length
  waveform = tf.cast(waveform, tf.float32)
  equal_length = tf.concat([waveform, zero_padding], 0)
  spectrogram = tf.signal.stft(
      equal_length, frame_length=255, frame_step=128)

  # Keep only the magnitude of the complex STFT output.
  spectrogram = tf.abs(spectrogram)

  return spectrogram

def get_spectrogram_and_label_id(audio, label):
  """Turn a waveform into a spectrogram with one extra trailing axis.

  Args:
    audio: Waveform tensor, passed to get_spectrogram.
    label: Accepted but not used; it is not part of the return value.

  Returns:
    The spectrogram of `audio` with a new axis of size 1 appended at the end.
  """
  return tf.expand_dims(get_spectrogram(audio), -1)

def preprocess_dataset(files):
  """Build a dataset of model inputs from a list of WAV file paths.

  Args:
    files: Sequence of file-path strings.

  Returns:
    A tf.data.Dataset with one element per file: the value produced by
    get_spectrogram_and_label_id for that file (currently the spectrogram
    only, without the label).
  """
  # Let tf.data pick the degree of parallelism. The constant is taken from
  # tf.data directly because this file defines no module-level AUTOTUNE.
  autotune = tf.data.AUTOTUNE

  files_ds = tf.data.Dataset.from_tensor_slices(files)
  waveform_ds = files_ds.map(get_waveform_and_label, num_parallel_calls=autotune)
  return waveform_ds.map(get_spectrogram_and_label_id, num_parallel_calls=autotune)

print("Loading model")
model = tf.keras.models.load_model('models/model_0.8_2021-10-04.h5')

# Load the class names saved next to the model. The `with` block closes the
# file as soon as the labels are read.
# NOTE: unpickling can execute arbitrary code; only load label files that you
# created yourself.
with open("models/labels_0.8_2021-10-04.pickle", "rb") as labels_file:
  CLASS_NAMES = pickle.load(labels_file)

# Folder that holds the recordings; the sample below sits in a sub-folder.
data_dir = pathlib.Path('recordings')
if not data_dir.exists():
    print("Unable to load recordings")

# Recording to classify (another candidate: data_dir/'kekeres/testB.wav').
sample_file = data_dir / 'kekeres' / 'kekeres_1633285520639.wav'
sample_ds = preprocess_dataset([str(sample_file)])

# Run the model on each (single-element) batch and report the top class.
for batch in sample_ds.batch(1):
  logits = model(batch)

  # Convert the model output into per-class probabilities.
  probabilities = tf.nn.softmax(logits, axis=1).numpy()

  # Position of the highest score, as a NumPy array with one entry.
  best = np.array(tf.argmax(logits, axis=1))

  class_text = str(CLASS_NAMES[best])
  confidence_text = str(probabilities[0][best])
  print("class: "+class_text)
  print("confidence: "+confidence_text)

Can you split the 5-minute WAV file into smaller parts and then feed them to the network?