I’m no audio expert, but voice recognition is one of those fields I’ve really wanted to explore. I’ve been following the TensorFlow audio recognition tutorial on this topic.
To create a model I used personal recordings made with some of my friends. I noticed that the model could only accept audio samples of about 31.2 KB (.wav format), which I believe corresponds to one second of 16-bit mono audio at 16 kHz (16,000 samples × 2 bytes ≈ 31.2 KB).
After training this model, I saved it along with the class labels using the following code:
# Save the model and the class labels
print("Saving model and labels")
timestamp = '{:%Y-%m-%d}'.format(datetime.datetime.now())
model.save("models/model_" + str(test_acc) + "_" + timestamp + ".h5")
with open("models/labels_" + str(test_acc) + "_" + timestamp + ".pickle", "wb") as f:
    pickle.dump(commands, f)
Now I want to be able to take a 5-minute audio file, or a stream of audio, and use this model to check whether a given word was said. Any ideas on how this could be accomplished?
At the moment I’m a bit clueless about what to do next. The following code can classify a single 31.2 KB clip, but what I really want is to classify longer audio files and to find a better way to use the model. Two rough ideas I’ve been toying with are sketched after the code.
import os
import pathlib
import datetime
import pickle
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras import layers
from tensorflow.keras import models
from IPython import display
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
def get_waveform_and_label(file_path):
    # The label is the name of the directory containing the file
    path = tf.strings.split(file_path, os.path.sep)
    label = path[-2]
    # Decode the wav file into a float32 waveform in [-1, 1]
    audio_binary = tf.io.read_file(file_path)
    audio, _ = tf.audio.decode_wav(audio_binary)
    waveform = tf.squeeze(audio, axis=-1)
    return waveform, label
def get_spectrogram(waveform):
    # Zero-pad clips shorter than 16000 samples (1 second at 16 kHz)
    zero_padding = tf.zeros([16000] - tf.shape(waveform), dtype=tf.float32)
    # Concatenate the audio with the padding so all clips have the same length
    waveform = tf.cast(waveform, tf.float32)
    equal_length = tf.concat([waveform, zero_padding], 0)
    # Short-time Fourier transform; keep only the magnitude
    spectrogram = tf.signal.stft(
        equal_length, frame_length=255, frame_step=128)
    spectrogram = tf.abs(spectrogram)
    return spectrogram
def get_spectrogram_and_label_id(audio, label):
    # The label is dropped here; for inference only the spectrogram is needed
    spectrogram = get_spectrogram(audio)
    spectrogram = tf.expand_dims(spectrogram, -1)  # add a channel dimension
    return spectrogram
def preprocess_dataset(files):
    files_ds = tf.data.Dataset.from_tensor_slices(files)
    output_ds = files_ds.map(get_waveform_and_label, num_parallel_calls=AUTOTUNE)
    output_ds = output_ds.map(get_spectrogram_and_label_id, num_parallel_calls=AUTOTUNE)
    return output_ds  # a dataset of spectrograms (labels are dropped above)
print("Loading model")
model = tf.keras.models.load_model('models/model_0.8_2021-10-04.h5')
CLASS_NAMES = pickle.loads(open("models/labels_0.8_2021-10-04.pickle", "rb").read())
data_dir = pathlib.Path('recordings')
if not data_dir.exists():
    print("Unable to load recordings")
AUTOTUNE = tf.data.AUTOTUNE
#sample_file = data_dir/'kekeres/testB.wav'
sample_file = data_dir/'kekeres/kekeres_1633285520639.wav'
sample_ds = preprocess_dataset([str(sample_file)])
for spectrogram in sample_ds.batch(1):
    prediction = model(spectrogram)
    probabilities = tf.nn.softmax(prediction, axis=1).numpy()
    index = int(tf.argmax(prediction, axis=1).numpy()[0])
    print("class: " + str(CLASS_NAMES[index]))
    print("confidence: " + str(probabilities[0][index]))