I’m using YOLOv8 for real-time hand detection in a web app. The model works well in Python, but after converting it to TensorFlow.js, detection struggles when the hand is too close to the webcam—sometimes missing it entirely or misplacing the bounding box.
I preprocess frames by resizing them to 640x640 and apply a Kalman filter for smoothing. The issue seems related to scale variation, but it only appears after the TensorFlow.js conversion.
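Before digging into the web side, I confirmed the original .pt weights handle close-range hands in Python. Roughly what I ran (best.pt and close_hand.jpg are placeholder paths), center-cropping progressively to simulate the hand filling more of the frame:

from ultralytics import YOLO
import cv2

model = YOLO("best.pt")  # placeholder path to my trained weights
img = cv2.imread("close_hand.jpg")  # frame where the hand fills most of the view

# Tighter center crops simulate the hand moving closer to the camera
h, w = img.shape[:2]
for zoom in (1.0, 1.5, 2.0):
    ch, cw = int(h / zoom), int(w / zoom)
    y0, x0 = (h - ch) // 2, (w - cw) // 2
    crop = img[y0:y0 + ch, x0:x0 + cw]
    res = model.predict(crop, imgsz=640, conf=0.25, verbose=False)
    print(zoom, "detections:", len(res[0].boxes))

In Python the detections stay stable across these crops, which is why I suspect the conversion rather than the weights.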
Here is how I did the conversion:
import os
from ultralytics import YOLO
import shutil
import tensorflow as tf
from google.colab import files as colab_files
def find_saved_model(base_path):
    """Find the SavedModel directory in the export path"""
    for root, dirs, filenames in os.walk(base_path):
        if 'saved_model.pb' in filenames:
            return root
    return None
def add_signatures(saved_model_dir):
    """Load the SavedModel and add required signatures"""
    print("Adding signatures to SavedModel...")
    # Load the model
    model = tf.saved_model.load(saved_model_dir)

    # Create a wrapper function that matches the model's interface
    @tf.function(input_signature=[
        tf.TensorSpec(shape=[1, 640, 640, 3], dtype=tf.float32, name='images')
    ])
    def serving_fn(images):
        # Pass False for the training parameter and None for the mask
        return model(images, False, None)

    # Trace the wrapper into a concrete function
    concrete_func = serving_fn.get_concrete_function()

    # Re-save the SavedModel with an explicit serving signature
    tf.saved_model.save(
        model,
        saved_model_dir,
        signatures={'serving_default': concrete_func}
    )
    print("Signatures added successfully")
    return saved_model_dir
def convert_to_tfjs(pt_model_path, output_dir):
    """
    Convert a PyTorch YOLO model to TensorFlow.js format

    Args:
        pt_model_path (str): Path to the .pt file
        output_dir (str): Directory to save the converted model
    """
    try:
        # Ensure output directory exists
        os.makedirs(output_dir, exist_ok=True)

        # Load the model
        print(f"Loading YOLO model from {pt_model_path}...")
        model = YOLO(pt_model_path)

        # First export to TensorFlow SavedModel format;
        # export() returns the path it wrote to
        print("Exporting to TensorFlow format...")
        export_path = model.export(
            format='saved_model',
            imgsz=640,
            half=False,
            simplify=True
        )

        # Find the SavedModel directory inside the export path
        saved_model_dir = find_saved_model(export_path)
        if not saved_model_dir:
            raise Exception(f"Cannot find SavedModel directory in {export_path}")
        print(f"Found SavedModel at: {saved_model_dir}")

        # Add signatures to the model
        saved_model_dir = add_signatures(saved_model_dir)

        # Convert to TensorFlow.js
        print("Converting to TensorFlow.js format...")
        tfjs_target_dir = os.path.join(output_dir, 'tfjs_model')

        # Ensure a clean target directory
        if os.path.exists(tfjs_target_dir):
            shutil.rmtree(tfjs_target_dir)
        os.makedirs(tfjs_target_dir)

        # Run the converter on the signed SavedModel
        conversion_command = (
            f"tensorflowjs_converter "
            f"--input_format=tf_saved_model "
            f"--output_format=tfjs_graph_model "
            f"--saved_model_tags=serve "
            f"--control_flow_v2=True "
            f"'{saved_model_dir}' "
            f"'{tfjs_target_dir}'"
        )
        print(f"Running conversion command: {conversion_command}")
        result = os.system(conversion_command)
        if result != 0:
            raise Exception("TensorFlow.js conversion failed")

        # Verify conversion
        if not os.path.exists(os.path.join(tfjs_target_dir, 'model.json')):
            raise Exception("TensorFlow.js conversion failed - model.json not found")
        print("Successfully converted model to TensorFlow.js format")
        print(f"Output saved to: {tfjs_target_dir}")

        # Print model files
        print("\nConverted model files:")
        for filename in os.listdir(tfjs_target_dir):
            print(f"- {filename}")

        # Create a zip file of the converted model and download it
        zip_path = f"{tfjs_target_dir}.zip"
        shutil.make_archive(tfjs_target_dir, 'zip', tfjs_target_dir)
        colab_files.download(zip_path)
    except Exception as e:
        print(f"Error during conversion: {str(e)}")
        print("\nDebug information:")
        print(f"Current working directory: {os.getcwd()}")
        print(f"PT model exists: {os.path.exists(pt_model_path)}")
        if 'saved_model_dir' in locals():
            print(f"SavedModel directory exists: {os.path.exists(saved_model_dir)}")
            if os.path.exists(saved_model_dir):
                print("SavedModel contents:")
                for root, dirs, filenames in os.walk(saved_model_dir):
                    print(f"\nDirectory: {root}")
                    for filename in filenames:
                        print(f"  - {filename}")
        raise
# Usage: upload the .pt file in Colab, then run the conversion
uploaded = colab_files.upload()
pt_model_path = next(iter(uploaded.keys()))
output_dir = "converted_model"
convert_to_tfjs(pt_model_path, output_dir)
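To narrow down whether the degradation comes from the exported graph or from my JS pre/post-processing, I also compared raw outputs from the SavedModel and the original .pt on identical pixels; a rough sketch (the paths and the 'images' signature name follow the export above):

import numpy as np
import tensorflow as tf
from ultralytics import YOLO

pt = YOLO("best.pt")  # placeholder path to the same weights
infer = tf.saved_model.load("best_saved_model").signatures["serving_default"]

img = np.random.rand(640, 640, 3).astype(np.float32)  # stand-in for a real close-up frame

# SavedModel path: NHWC float in [0, 1], matching the tensor fed to TF.js
raw = infer(images=tf.constant(img[None]))
print({k: v.shape for k, v in raw.items()})

# PyTorch path on the identical pixels (Ultralytics expects HWC uint8)
res = pt.predict((img * 255).astype(np.uint8), imgsz=640, conf=0.25, verbose=False)
print(res[0].boxes)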
My hand pose detection web app:
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Real-time Hand Pose Detection</title>
    <script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs"></script>
    <style>
        body {
            text-align: center;
            font-family: Arial, sans-serif;
            margin: 0;
            padding: 20px;
            background: #f0f0f0;
        }
        .container {
            position: relative;
            width: 640px;
            height: 480px;
            margin: 20px auto;
        }
        video, canvas {
            position: absolute;
            left: 0;
            top: 0;
        }
        button {
            margin: 10px;
            padding: 10px 20px;
            font-size: 16px;
            cursor: pointer;
            background: #007bff;
            color: white;
            border: none;
            border-radius: 4px;
        }
        button:hover {
            background: #0056b3;
        }
        #status {
            padding: 10px;
            background: #fff;
            border-radius: 4px;
            display: inline-block;
        }
    </style>
</head>
<body>
    <h1>Real-time Hand Pose Detection (YOLOv8)</h1>
    <button onclick="loadModel()">Load Model</button>
    <button onclick="startWebcam()">Start Webcam</button>
    <p id="status">Model not loaded</p>
    <div class="container">
        <video id="video" width="640" height="480" autoplay></video>
        <canvas id="canvas" width="640" height="480"></canvas>
    </div>
<script type="module">
// Kalman Filter Implementation (constant-velocity model: the state vector is
// [positions..., velocities...] and the measurements are the positions)
class KalmanFilter {
    constructor(stateSize, measurementSize, processNoise = 0.001, measurementNoise = 0.1) {
        this.state = new Array(stateSize).fill(0);
        this.covariance = new Array(stateSize * stateSize).fill(0);
        this.processNoise = processNoise;
        this.measurementNoise = measurementNoise;
        this.stateSize = stateSize;
        this.measurementSize = measurementSize;
        // Initialize covariance matrix with high uncertainty
        for (let i = 0; i < stateSize; i++) {
            this.covariance[i * stateSize + i] = 1000;
        }
    }

    predict(dt = 1 / 30) {
        const n = this.stateSize;
        // State transition matrix F for constant velocity
        const F = new Array(n * n).fill(0);
        for (let i = 0; i < n / 2; i++) {
            F[i * n + i] = 1;
            F[i * n + (i + n / 2)] = dt;
            F[(i + n / 2) * n + (i + n / 2)] = 1;
        }
        // Predict state: x' = F x
        const newState = new Array(n).fill(0);
        for (let i = 0; i < n; i++) {
            for (let j = 0; j < n; j++) {
                newState[i] += F[i * n + j] * this.state[j];
            }
        }
        this.state = newState;
        // Predict covariance: P' = F P F^T + Q (the earlier version dropped
        // the F^T factor, which slowly corrupts the covariance)
        const FP = new Array(n * n).fill(0);
        for (let i = 0; i < n; i++) {
            for (let j = 0; j < n; j++) {
                for (let k = 0; k < n; k++) {
                    FP[i * n + j] += F[i * n + k] * this.covariance[k * n + j];
                }
            }
        }
        const newCovariance = new Array(n * n).fill(0);
        for (let i = 0; i < n; i++) {
            for (let j = 0; j < n; j++) {
                for (let k = 0; k < n; k++) {
                    newCovariance[i * n + j] += FP[i * n + k] * F[j * n + k];
                }
            }
        }
        // Add process noise
        for (let i = 0; i < n; i++) {
            newCovariance[i * n + i] += this.processNoise;
        }
        this.covariance = newCovariance;
    }

    update(measurement) {
        const n = this.stateSize;
        const m = this.measurementSize;
        // The measurement matrix H just selects the first m (position) states,
        // so every H product below reduces to indexing the top-left of P.
        // Innovation covariance S = H P H^T + R
        const S = new Array(m * m).fill(0);
        for (let i = 0; i < m; i++) {
            for (let j = 0; j < m; j++) {
                S[i * m + j] = this.covariance[i * n + j];
            }
            S[i * m + i] += this.measurementNoise;
        }
        // Kalman gain K = P H^T S^-1, approximating S^-1 by its diagonal
        // (cheap, and adequate while the filter stays near steady state)
        const K = new Array(n * m).fill(0);
        for (let i = 0; i < n; i++) {
            for (let j = 0; j < m; j++) {
                K[i * m + j] = this.covariance[i * n + j] / S[j * m + j];
            }
        }
        // Innovation y = z - H x
        const innovation = new Array(m).fill(0);
        for (let i = 0; i < m; i++) {
            innovation[i] = measurement[i] - this.state[i];
        }
        // Update state: x' = x + K y
        for (let i = 0; i < n; i++) {
            for (let j = 0; j < m; j++) {
                this.state[i] += K[i * m + j] * innovation[j];
            }
        }
        // Update covariance: P' = (I - K H) P, with (H P)[k][j] = P[k][j]
        const newCovariance = new Array(n * n).fill(0);
        for (let i = 0; i < n; i++) {
            for (let j = 0; j < n; j++) {
                newCovariance[i * n + j] = this.covariance[i * n + j];
                for (let k = 0; k < m; k++) {
                    newCovariance[i * n + j] -= K[i * m + k] * this.covariance[k * n + j];
                }
            }
        }
        this.covariance = newCovariance;
    }

    getState() {
        return this.state.slice(0, this.measurementSize);
    }
}
let model;
let video = document.getElementById("video");
let canvas = document.getElementById("canvas");
let ctx = canvas.getContext("2d");
const CONF_THRESHOLD = 0.75;
const IOU_THRESHOLD = 0.1;
let isProcessing = false;
let previousDetections = [];
// Initialize Kalman filters
let bboxFilter = new KalmanFilter(8, 4, 0.005, 0.2); // State: [x, y, w, h, vx, vy, vw, vh]
let keypointFilter = new KalmanFilter(4, 2, 0.005, 0.2); // State: [x, y, vx, vy]
let lastFrameTime = performance.now();
// Model input size constants
const MODEL_WIDTH = 640;
const MODEL_HEIGHT = 640;
const SCALE_FACTOR = 1.8;
async function loadModel() {
    try {
        document.getElementById("status").innerText = "Loading model...";
        model = await tf.loadGraphModel('http://localhost:8000/model.json');
        document.getElementById("status").innerText = "Model loaded!";
        console.log("Model loaded successfully");
    } catch (error) {
        console.error("Error loading model:", error);
        document.getElementById("status").innerText = "Error loading model!";
    }
}
async function startWebcam() {
    if (!model) {
        alert("Please load the model first!");
        return;
    }
    try {
        const stream = await navigator.mediaDevices.getUserMedia({
            video: {
                width: { ideal: 640 },
                height: { ideal: 480 },
                facingMode: 'user'
            }
        });
        video.srcObject = stream;
        video.onloadedmetadata = () => {
            video.play();
            processVideoFrame();
        };
    } catch (err) {
        console.error("Error accessing webcam:", err);
        document.getElementById("status").innerText = "Error accessing webcam!";
    }
}
async function processVideoFrame() {
    if (!model || !video.videoWidth || isProcessing) return;
    try {
        isProcessing = true;
        // Letterbox the frame into the 640x640 model input
        const offscreenCanvas = document.createElement('canvas');
        offscreenCanvas.width = MODEL_WIDTH;
        offscreenCanvas.height = MODEL_HEIGHT;
        const offscreenCtx = offscreenCanvas.getContext('2d');
        const scale = Math.min(MODEL_WIDTH / video.videoWidth, MODEL_HEIGHT / video.videoHeight);
        const scaledWidth = video.videoWidth * scale;
        const scaledHeight = video.videoHeight * scale;
        const offsetX = (MODEL_WIDTH - scaledWidth) / 2;
        const offsetY = (MODEL_HEIGHT - scaledHeight) / 2;
        offscreenCtx.fillStyle = 'black';
        offscreenCtx.fillRect(0, 0, MODEL_WIDTH, MODEL_HEIGHT);
        offscreenCtx.drawImage(video, offsetX, offsetY, scaledWidth, scaledHeight);
        const imgTensor = tf.tidy(() => {
            return tf.browser.fromPixels(offscreenCanvas)
                .expandDims(0)
                .toFloat()
                .div(255.0);
        });
        const predictions = await model.predict(imgTensor);
        imgTensor.dispose();
        const processedDetections = await processDetections(predictions, {
            offsetX,
            offsetY,
            scale,
            originalWidth: video.videoWidth,
            originalHeight: video.videoHeight
        });
        const smoothedDetections = smoothDetections(processedDetections);
        drawDetections(smoothedDetections);
        previousDetections = smoothedDetections;
        if (Array.isArray(predictions)) {
            predictions.forEach(p => p.dispose());
        } else {
            predictions.dispose();
        }
    } catch (error) {
        console.error("Error in processing frame:", error);
    } finally {
        isProcessing = false;
        requestAnimationFrame(processVideoFrame);
    }
}
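// Diagnostic helper (not part of the pipeline): logs the raw output shapes so I
// can confirm the [1, 7, N] layout that processDetections() below assumes.
// Call it once right after model.predict() when detections look wrong.
function logPredictionShape(predictions) {
    const tensors = Array.isArray(predictions) ? predictions : [predictions];
    tensors.forEach((t, i) => console.log(`output ${i} shape:`, t.shape));
}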
async function processDetections(predictionTensor, transformInfo) {
    // Assumes YOLOv8-pose-style output of shape [1, 7, N] where the rows are
    // [cx, cy, w, h, conf, kp_x, kp_y] in 640x640 letterboxed pixels
    const predictions = await predictionTensor.array();
    if (!predictions.length || !predictions[0].length) {
        return [];
    }
    let detections = [];
    const numDetections = predictions[0][0].length;
    for (let i = 0; i < numDetections; i++) {
        const confidence = predictions[0][4][i];
        if (confidence > CONF_THRESHOLD) {
            // Undo the letterbox: remove padding, then divide by the resize scale
            let x = (predictions[0][0][i] - transformInfo.offsetX) / transformInfo.scale;
            let y = (predictions[0][1][i] - transformInfo.offsetY) / transformInfo.scale;
            let width = (predictions[0][2][i] / transformInfo.scale) * SCALE_FACTOR;
            let height = (predictions[0][3][i] / transformInfo.scale) * SCALE_FACTOR;
            let kp_x = (predictions[0][5][i] - transformInfo.offsetX) / transformInfo.scale;
            let kp_y = (predictions[0][6][i] - transformInfo.offsetY) / transformInfo.scale;
            // Normalize to [0, 1] relative to the source video
            x /= transformInfo.originalWidth;
            y /= transformInfo.originalHeight;
            width /= transformInfo.originalWidth;
            height /= transformInfo.originalHeight;
            kp_x /= transformInfo.originalWidth;
            kp_y /= transformInfo.originalHeight;
            x = Math.max(0, Math.min(1, x));
            y = Math.max(0, Math.min(1, y));
            kp_x = Math.max(0, Math.min(1, kp_x));
            kp_y = Math.max(0, Math.min(1, kp_y));
            detections.push({
                bbox: [x, y, width, height],
                confidence,
                keypoint: [kp_x, kp_y]
            });
        }
    }
    return applyNMS(detections);
}
function smoothDetections(currentDetections) {
    const currentTime = performance.now();
    const dt = (currentTime - lastFrameTime) / 1000; // Convert ms to seconds
    lastFrameTime = currentTime;
    // Note: one shared filter pair smooths every detection, so this implicitly
    // assumes at most one hand in frame; multiple hands get blended together
    return currentDetections.map(detection => {
        // Predict next state
        bboxFilter.predict(dt);
        keypointFilter.predict(dt);
        // Update with new measurements
        const [x, y, width, height] = detection.bbox;
        bboxFilter.update([x, y, width, height]);
        const [kpX, kpY] = detection.keypoint;
        keypointFilter.update([kpX, kpY]);
        // Return the filtered states
        return {
            bbox: bboxFilter.getState(),
            confidence: detection.confidence,
            keypoint: keypointFilter.getState()
        };
    });
}
function calculateIoU(box1, box2) {
    const [x1, y1, w1, h1] = box1;
    const [x2, y2, w2, h2] = box2;
    // Boxes are center-format; convert to min/max corners
    const x1min = x1 - w1 / 2, x1max = x1 + w1 / 2;
    const y1min = y1 - h1 / 2, y1max = y1 + h1 / 2;
    const x2min = x2 - w2 / 2, x2max = x2 + w2 / 2;
    const y2min = y2 - h2 / 2, y2max = y2 + h2 / 2;
    const xOverlap = Math.max(0, Math.min(x1max, x2max) - Math.max(x1min, x2min));
    const yOverlap = Math.max(0, Math.min(y1max, y2max) - Math.max(y1min, y2min));
    const intersectionArea = xOverlap * yOverlap;
    const union = w1 * h1 + w2 * h2 - intersectionArea;
    return intersectionArea / union;
}
function applyNMS(detections) {
    // Greedy NMS: keep the highest-confidence box, suppress overlapping ones
    detections.sort((a, b) => b.confidence - a.confidence);
    const selected = [];
    const active = new Set(Array(detections.length).keys());
    for (let i = 0; i < detections.length; i++) {
        if (!active.has(i)) continue;
        selected.push(detections[i]);
        for (let j = i + 1; j < detections.length; j++) {
            if (!active.has(j)) continue;
            const iou = calculateIoU(detections[i].bbox, detections[j].bbox);
            if (iou >= IOU_THRESHOLD) active.delete(j);
        }
    }
    return selected;
}
function drawDetections(detections) {
    ctx.clearRect(0, 0, canvas.width, canvas.height);
    ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
    detections.forEach(detection => {
        const [x, y, width, height] = detection.bbox;
        const [keypointX, keypointY] = detection.keypoint;
        // Convert normalized center coordinates to pixel values
        const boxX = (x - width / 2) * canvas.width;
        const boxY = (y - height / 2) * canvas.height;
        const boxWidth = width * canvas.width;
        const boxHeight = height * canvas.height;
        // Draw bounding box
        ctx.strokeStyle = 'red';
        ctx.lineWidth = 2;
        ctx.strokeRect(boxX, boxY, boxWidth, boxHeight);
        // Draw keypoint
        const kpX = keypointX * canvas.width;
        const kpY = keypointY * canvas.height;
        ctx.fillStyle = 'blue';
        ctx.beginPath();
        ctx.arc(kpX, kpY, 5, 0, 2 * Math.PI);
        ctx.fill();
        // Draw confidence score
        ctx.fillStyle = 'red';
        ctx.font = '14px Arial';
        ctx.fillText(`Conf: ${detection.confidence.toFixed(2)}`, boxX, boxY - 5);
    });
}
window.loadModel = loadModel;
window.startWebcam = startWebcam;
</script>
</body>
</html>
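For completeness: during testing I serve the converted model with a plain static file server started inside converted_model/tfjs_model (e.g. python -m http.server 8000), so that model.json and its .bin weight shards resolve at http://localhost:8000/ the way loadModel() expects.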
Things I have already tried: adjusting the bounding-box scaling (the SCALE_FACTOR constant) and tuning the IoU and confidence thresholds.