Train a TensorFlow model to detect silence in a .wav file

I need to detect silence (slight background noise, not absolute silence) in a wave file. All the wave files (both training and detection) are 16-bit mono.

Each sound file is divided into 0.1-second blocks (1600 samples at a 16 kHz sample rate) as training data, and for feature extraction either Mel-frequency cepstral coefficients (MFCC) or the short-time Fourier transform (STFT) can be used (I have tried both).
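
For reference, a minimal sketch (assuming a 16 kHz sample rate and the same parameters as in the scripts below) shows the per-block feature shape each extractor produces:

import numpy as np
import librosa

sr = 16000
block = np.zeros(1600, dtype=np.float32)  # stand-in for one 0.1-second block

mfccs = librosa.feature.mfcc(y=block, sr=sr, n_fft=1024, hop_length=512, n_mfcc=13)
print(mfccs.T.shape)  # (4, 13): 4 frames x 13 coefficients

stft = np.abs(librosa.stft(block, n_fft=1024, hop_length=512))
print(stft.shape)     # (513, 4): 513 frequency bins x 4 frames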

Here is the training script, which processes all the silence files in the given directory:

# train silence model using tensorflow
import glob
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1' 
os.environ["TF_ENABLE_ONEDNN_OPTS"] = '0'

import tensorflow as tf
from tensorflow.keras import layers, models
import numpy as np
import librosa

# Function to extract MFCC or STFT features from an audio file
def extract_features(file_path, mfcc=True, hop_length=512, n_mfcc=13):
    signal, sr = librosa.load(file_path, sr=None)
    block_size = 1600  # 0.1 seconds at a 16 kHz sample rate
    num_blocks = len(signal) // block_size
    
    features = []
    for i in range(num_blocks):
        block = signal[i * block_size: (i + 1) * block_size]
        if mfcc:
            mfccs = librosa.feature.mfcc(y=block, sr=sr, n_fft=1024, hop_length=hop_length, n_mfcc=n_mfcc)
            features.append(mfccs.T)
        else:
            features.append(np.abs(librosa.stft(block, n_fft=1024, hop_length=hop_length)))
    
    return np.array(features)

# Collect the silence sound files from the training directory
silence_files = glob.glob('sounds/silence/silence*.wav')

# Extract features for all files
X = []
y = []
for file in silence_files:
    features = extract_features(file, mfcc=True)
    X.extend(features)
    y.extend([0] * len(features))  # label every silence block as class 0

# Convert lists to arrays
X = np.array(X)
y = np.array(y)

# Define the model
model = models.Sequential([
    layers.Input(shape=X[0].shape),
    layers.Reshape(target_shape=(*X[0].shape, 1)),  # Reshape to include channel dimension
    layers.Conv2D(32, kernel_size=(3, 3), activation='relu'),
    layers.MaxPooling2D(pool_size=(2, 2)),
    layers.Flatten(),
    layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=10, batch_size=32)

# Save the model to an external file
model.save("models/silence_model.keras")
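
As a sanity check, these lines can be appended to the end of the training script to confirm the shapes and labels the model is fitted on:

# Sanity check: shapes and labels seen by the model
print(X.shape)                 # expected (num_blocks, 4, 13) for the MFCC path
print(y.shape, np.unique(y))   # note: only class 0 (silence) is present
model.summary()                # input shape should be (None, 4, 13)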

And here is the detection script:

# detection of silence using tensorflow model
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1' 
os.environ["TF_ENABLE_ONEDNN_OPTS"] = '0'
import soundfile as sf

import tensorflow as tf
import librosa
import numpy as np

# Function to extract MFCC or STFT features from a single audio block
def extract_features_from_block(block, sr=16000, mfcc=True, hop_length=512, n_mfcc=13):
    if mfcc:
        mfccs = librosa.feature.mfcc(y=block, sr=sr, n_fft=1024, hop_length=hop_length, n_mfcc=n_mfcc)
        # Transpose to (frames, coefficients) as in training, then add a batch dimension;
        # the model's Reshape layer adds the channel dimension itself
        return np.expand_dims(mfccs.T, axis=0)
    else:
        stft = np.abs(librosa.stft(block, n_fft=1024, hop_length=hop_length))
        # Take the magnitude first, then add a batch dimension to match training
        return np.expand_dims(stft, axis=0)


def remove_silence(input_file, output_file, model, threshold=0.5):
    signal, sr = librosa.load(input_file, sr=None)
    block_size = 1600  # 0.1 seconds at a 16 kHz sample rate
    num_blocks = len(signal) // block_size

    output_blocks = []
    for i in range(num_blocks):
        block = signal[i * block_size: (i + 1) * block_size]
        feature = extract_features_from_block(block, sr=sr, mfcc=True)
        prediction = model.predict(feature, verbose=0)[0][0]
        print(prediction)
        if prediction < threshold:
            # Block classified as silence: replace it with zeros
            output_blocks.append(np.zeros_like(block))
        else:
            # Block classified as non-silence: keep it unchanged
            output_blocks.append(block)

    # Concatenate the blocks and write the processed signal to the output file
    output_signal = np.concatenate(output_blocks)
    sf.write(output_file, output_signal, sr)

# Load the saved model
model = tf.keras.models.load_model("models/silence_model.keras")

# Usage example
remove_silence("./sounds/slience_test.wav", "output_file.wav", model)

The problem in detection is that the prediction comes out as almost exactly 0.0 for every block. Here is a link to download the three silence files for training and one for testing: SILENCE.ZIP
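
If it helps with diagnosis, the distribution of the model's outputs on the training features themselves can be printed at the end of the training script (reusing X and model from above):

# Distribution of model outputs on the training features
preds = model.predict(X, verbose=0).ravel()
print(preds.min(), preds.mean(), preds.max())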