Hey everyone, my first post here…
I made a wake-word detection model with Teachable Machine and it works perfectly as a TensorFlow.js model, but when I convert it to a .tflite the accuracy drops so badly that it basically stops working. So I converted the TensorFlow.js model into a TensorFlow SavedModel using tensorflowjs_converter in WSL, then converted that SavedModel to an HDF5 (.h5) model, and now I have no idea how to use this model offline.
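For what it's worth, the simplest way I know to run the converted model offline is to load the HDF5 file with plain TensorFlow and call predict() on it. This is only a minimal sketch; ./model.h5 is a placeholder for wherever the converter wrote the file, and the (1, 43, 232, 1) input shape is an assumption taken from the script below:

import numpy as np
import tensorflow as tf

# Load the converted Keras model (path is a placeholder)
model = tf.keras.models.load_model('./model.h5')
model.summary()  # shows the input shape the preprocessing has to produce

# Dummy input just to verify the model runs offline; shape assumed to be (1, 43, 232, 1)
dummy = np.zeros((1, 43, 232, 1), dtype=np.float32)
print(model.predict(dummy))

model.summary() is also the quickest way to double-check the expected input shape before wiring up the microphone code below.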
import time
import numpy as np
import pyaudio
import tensorflow as tf
# Load the saved model
loaded_model = tf.saved_model.load('./keras')
# Get the serving default signature
serving_default = loaded_model.signatures["serving_default"]
# Define parameters for microphone input
FORMAT = pyaudio.paInt16 # Sample format
CHANNELS = 1 # Number of audio channels (1 for mono, 2 for stereo)
RATE = 16000 # Sampling rate (samples per second)
CHUNK = 1024 # Number of frames per buffer
DESIRED_DURATION_SECONDS = 0.6 # Desired duration of audio for each prediction
# Create an instance of PyAudio
audio = pyaudio.PyAudio()
# Open the microphone stream
stream = audio.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)
print("Listening...")
# Start timing
start_time = time.time()
# Buffer to hold microphone input
frames = []
# Function to preprocess audio data
def preprocess_audio(audio_data, target_shape=(43, 232)):
    """
    Preprocesses raw audio data to match the input shape expected by the model.

    Args:
    - audio_data: Raw audio data as a numpy array of int16 samples.
    - target_shape: Target (rows, columns) shape of the audio data after preprocessing.

    Returns:
    - Preprocessed audio data as a numpy array of shape (1, rows, columns, 1).
    """
    # Normalize int16 samples to the range [-1, 1]
    normalized_audio = audio_data / np.iinfo(np.int16).max
    # Total number of samples needed to fill the target shape exactly
    target_length = target_shape[0] * target_shape[1]
    # Pad with zeros (or trim) so the audio fits the target shape
    if len(normalized_audio) < target_length:
        normalized_audio = np.pad(normalized_audio,
                                  (0, target_length - len(normalized_audio)),
                                  mode='constant')
    else:
        normalized_audio = normalized_audio[:target_length]
    # Reshape the audio data to match the target shape
    preprocessed_audio = np.reshape(normalized_audio, target_shape)
    # Add batch and channel dimensions -> (1, rows, columns, 1)
    preprocessed_audio = np.expand_dims(preprocessed_audio, axis=0)
    preprocessed_audio = np.expand_dims(preprocessed_audio, axis=-1)
    return preprocessed_audio
# Record audio data in chunks and make predictions
try:
    while True:
        # Read microphone data
        data = stream.read(CHUNK)
        frames.append(data)
        # When enough data has been collected for the desired duration
        if len(frames) == int(DESIRED_DURATION_SECONDS * RATE / CHUNK):
            # Convert microphone data to a numpy array
            audio_data = np.frombuffer(b''.join(frames), dtype=np.int16)
            # Preprocess audio data
            preprocessed_audio = preprocess_audio(audio_data)
            # Make predictions
            predictions = serving_default(conv2d_1_input=tf.constant(preprocessed_audio, dtype=tf.float32))
            # Extract probabilities (class 0 = background noise, class 1 = wake word)
            probs = predictions['sequential_3'].numpy()[0]
            background_noise_prob = probs[0]
            wake_word_prob = probs[1]
            # Print results
            print("Background Noise Probability:", round(background_noise_prob * 100), "%")
            print("Wake Word Probability:", round(wake_word_prob * 100), "%")
            # Calculate and print elapsed time for this window
            end_time = time.time()
            elapsed_time_ms = (end_time - start_time) * 1000
            print("Time Taken:", round(elapsed_time_ms, 1), "ms.")
            # Reset the buffer and the timer for the next iteration
            frames = []
            start_time = time.time()
except KeyboardInterrupt:
    # Stop the stream when Ctrl+C is pressed
    print("Stopped recording.")

# Close the stream and PyAudio
stream.stop_stream()
stream.close()
audio.terminate()
Edit: I managed to load the SavedModel (.pb) in Python with GPT's help, but it's still not working properly.
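If the SavedModel loads but the predictions look wrong, one thing worth checking is whether the input name and shape being fed above (conv2d_1_input with shape (1, 43, 232, 1)) actually match what the exported serving signature expects. A minimal sketch of how I'd inspect that, reusing the ./keras path from the script:

import tensorflow as tf

loaded = tf.saved_model.load('./keras')
sig = loaded.signatures['serving_default']

# Expected input tensors: names, shapes and dtypes
print(sig.structured_input_signature)
# Output tensors: confirms the 'sequential_3' key used when reading predictions
print(sig.structured_outputs)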