I need to detect silence (slight noise, not absolute silence) in the wave file. All the wave files (training & detection) are 16-bit and mono.
Here is the training script that processes all the silence files in the given directory. The sound file is divided into a 0.1-second block (1600 frames) as training data and for feature detection, Mel Frequency Cepstrum Coefficient (MFCC) or Short-time Fourier transform (STFT) can be used. (I have tried both)
Here is the training script
# train silence model using tensorflow
import glob
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
os.environ["TF_ENABLE_ONEDNN_OPTS"] = '0'
import tensorflow as tf
from tensorflow.keras import layers, models
import numpy as np
import os
import librosa
# Function to extract MFCC features from audio files
def extract_features(file_path, mfcc=True, hop_length=512, n_mfcc=13):
signal, sr = librosa.load(file_path, sr=None)
block_size = 1600 # sr / 10 for 0.1 seconds
num_blocks = len(signal) // block_size
features = []
for i in range(num_blocks):
block = signal[i * block_size: (i + 1) * block_size]
if mfcc:
mfccs = librosa.feature.mfcc(y=block, sr=sr, n_fft=1024, hop_length=hop_length, n_mfcc=n_mfcc)
features.append(mfccs.T)
else:
features.append(np.abs(librosa.stft(block, n_fft=1024, hop_length=hop_length)))
return np.array(features)
# Directory containing silence and noise sound files
silence_files = glob.glob('sounds/silence/silence*.wav')
# Extract features for all files
X = []
y = []
for file in silence_files:
features = extract_features(file, mfcc=True)
X.extend(features)
y.extend([0] * len(features)) # Assuming silence files are labeled as 0
# Convert lists to arrays
X = np.array(X)
y = np.array(y)
# Define and compile the model
model = models.Sequential([
layers.Input(shape=X[0].shape),
layers.Reshape(target_shape=(*X[0].shape, 1)), # Reshape to include channel dimension
layers.Conv2D(32, kernel_size=(3, 3), activation='relu'),
layers.MaxPooling2D(pool_size=(2, 2)),
layers.Flatten(),
layers.Dense(1, activation='sigmoid')
])
# Compile the model
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy'])
# Train the model
model.fit(X, y, epochs=10, batch_size=32)
# Save the model to an external file
model.save("models/silence_model.keras")
and here is the detection script
# detection of silence using tensorflow model
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
os.environ["TF_ENABLE_ONEDNN_OPTS"] = '0'
import soundfile as sf
import tensorflow as tf
import librosa
import numpy as np
# Function to extract MFCC features from audio frames
def extract_features_from_block(block, mfcc=True, hop_length=512, n_mfcc=13):
if mfcc:
mfccs = librosa.feature.mfcc(y=block, sr=16000, n_fft=1024, hop_length=hop_length, n_mfcc=n_mfcc)
# Reshape to match model input shape
mfccs = mfccs.reshape(1, mfccs.shape[0], mfccs.shape[1], 1)
return mfccs
else:
stft = librosa.stft(block, n_fft=1024, hop_length=hop_length)
# Reshape to match model input shape
stft = stft.reshape(1, stft.shape[0], stft.shape[1], 1)
return np.abs(stft)
def remove_silence(input_file, output_file, model, threshold=0.5):
signal, sr = librosa.load(input_file, sr=None)
block_size = 1600 # sr / 10 for 0.1 seconds
num_blocks = len(signal) // block_size
output_signal = np.array([])
for i in range(num_blocks):
block = signal[i * block_size: (i + 1) * block_size]
feature = extract_features_from_block(block, mfcc=True)
prediction = model.predict(feature)[0][0]
print(prediction)
if prediction < threshold:
# Add complete silence block
output_signal = np.concatenate((output_signal, np.zeros_like(block)))
else:
# Add non-silence block
output_signal = np.concatenate((output_signal, block))
# Write processed signal to output file
sf.write(output_file, output_signal, sr)
# Load the saved model
model = tf.keras.models.load_model("models/silence_model.keras")
# Usage example
remove_silence("./sounds/slience_test.wav", "output_file.wav", model)
The problem in detection is that prediction is coming out almost equal to 0.0 for every block. Here is the link to download 3 silence files for training and one for testing. SILENCE.ZIP