YOLOv8 Hand Detection Fails at Close Range After TensorFlow.js Conversion

I’m using YOLOv8 for real-time hand detection in a web app. The model works well in Python, but after converting it to TensorFlow.js, detection struggles when the hand is close to the webcam: it sometimes misses the hand entirely or misplaces the bounding box.

I preprocess frames by letterboxing them to 640x640 (scale to fit, then pad) and apply a Kalman filter for smoothing. The issue seems related to scale variation, but it only appears after the TensorFlow.js conversion.
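
For reference, this is the letterbox mapping I assume on both sides (a minimal sketch using my 640x480 webcam numbers, just to illustrate how model-space coordinates map back to the frame):

# Letterbox mapping (illustrative numbers: a 640x480 frame into a 640x640 input)
scale = min(640 / 640, 640 / 480)       # = 1.0
offset_x = (640 - 640 * scale) / 2      # = 0
offset_y = (640 - 480 * scale) / 2      # = 80

# Mapping a model-space y coordinate back to frame space
y_model = 400.0
y_frame = (y_model - offset_y) / scale  # = 320.0
print(y_frame)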

Here is how I did the conversion:

import os
from ultralytics import YOLO
import shutil
import tensorflow as tf
from google.colab import files as colab_files

def find_saved_model(base_path):
    """Find the SavedModel directory in the export path"""
    for root, dirs, filenames in os.walk(base_path):
        if 'saved_model.pb' in filenames:
            return root
    return None

def add_signatures(saved_model_dir):
    """Load the SavedModel and add required signatures"""
    print("Adding signatures to SavedModel...")

    # Load the model
    model = tf.saved_model.load(saved_model_dir)

    # Wrap the model with a fixed [1, 640, 640, 3] NHWC float32 input signature
    @tf.function(input_signature=[
        tf.TensorSpec(shape=[1, 640, 640, 3], dtype=tf.float32, name='images')
    ])
    def serving_fn(images):
        # The loaded Keras model is called as (inputs, training, mask);
        # training=False for inference
        return model(images, False, None)

    # Trace the wrapper into a concrete function
    concrete_func = serving_fn.get_concrete_function()

    # Re-save the model over the same directory with an explicit serving signature
    tf.saved_model.save(
        model,
        saved_model_dir,
        signatures={
            'serving_default': concrete_func
        }
    )

    print("Signatures added successfully")
    return saved_model_dir

def convert_to_tfjs(pt_model_path, output_dir):
    """
    Convert a PyTorch YOLO model to TensorFlow.js format

    Args:
        pt_model_path (str): Path to the .pt file
        output_dir (str): Directory to save the converted model
    """
    try:
        # Ensure output directory exists
        os.makedirs(output_dir, exist_ok=True)

        # Load the model
        print(f"Loading YOLO model from {pt_model_path}...")
        model = YOLO(pt_model_path)

        # First export to TensorFlow format
        print("Exporting to TensorFlow format...")


        success = model.export(
            format='saved_model',
            imgsz=640,
            half=False,
            simplify=True
        )

        # Find the SavedModel directory
        saved_model_dir = find_saved_model(os.path.join(os.getcwd(), "best_saved_model"))
        if not saved_model_dir:
            raise Exception(f"Cannot find SavedModel directory in {os.path.dirname(pt_model_path)}")

        print(f"Found SavedModel at: {saved_model_dir}")

        # Add signatures to the model
        saved_model_dir = add_signatures(saved_model_dir)

        # Convert to TensorFlow.js
        print("Converting to TensorFlow.js format...")
        tfjs_target_dir = os.path.join(output_dir, 'tfjs_model')

        # Ensure clean target directory
        if os.path.exists(tfjs_target_dir):
            shutil.rmtree(tfjs_target_dir)
        os.makedirs(tfjs_target_dir)

        # Try conversion with modified parameters
        conversion_command = (
            f"tensorflowjs_converter "
            f"--input_format=tf_saved_model "
            f"--output_format=tfjs_graph_model "
            f"--saved_model_tags=serve "
            f"--control_flow_v2=True "
            f"'{saved_model_dir}' "
            f"'{tfjs_target_dir}'"
        )

        print(f"Running conversion command: {conversion_command}")
        result = os.system(conversion_command)

        if result != 0:
            raise Exception("TensorFlow.js conversion failed")

        # Verify conversion
        if not os.path.exists(os.path.join(tfjs_target_dir, 'model.json')):
            raise Exception("TensorFlow.js conversion failed - model.json not found")

        print(f"Successfully converted model to TensorFlow.js format")
        print(f"Output saved to: {tfjs_target_dir}")

        # Print model files
        print("\nConverted model files:")
        for filename in os.listdir(tfjs_target_dir):
            print(f"- {filename}")

        # Create a zip file of the converted model
        zip_path = f"{tfjs_target_dir}.zip"
        shutil.make_archive(tfjs_target_dir, 'zip', tfjs_target_dir)

        # Download the zip file using the renamed colab_files module
        colab_files.download(zip_path)

    except Exception as e:
        print(f"Error during conversion: {str(e)}")
        print("\nDebug information:")
        print(f"Current working directory: {os.getcwd()}")
        print(f"PT model exists: {os.path.exists(pt_model_path)}")
        if 'saved_model_dir' in locals():
            print(f"SavedModel directory exists: {os.path.exists(saved_model_dir)}")
            if os.path.exists(saved_model_dir):
                print("SavedModel contents:")
                for root, dirs, filenames in os.walk(saved_model_dir):
                    print(f"\nDirectory: {root}")
                    for filename in filenames:
                        print(f"  - {filename}")
        raise

# Usage
uploaded = colab_files.upload()
pt_model_path = next(iter(uploaded.keys()))
output_dir = "converted_model"
convert_to_tfjs(pt_model_path, output_dir)
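
To rule out the export itself, I also sanity-check that the re-signed SavedModel roughly matches the original .pt weights on a single input (a rough sketch; "best.pt" and "best_saved_model" are placeholders for my paths, and the random frame stands in for a real letterboxed image):

import numpy as np
import tensorflow as tf
from ultralytics import YOLO

pt_model = YOLO("best.pt")                           # original PyTorch weights
tf_model = tf.saved_model.load("best_saved_model")   # exported + re-signed SavedModel

# Stand-in for one letterboxed 640x640 RGB frame
frame = np.random.randint(0, 255, (640, 640, 3), dtype=np.uint8)

# Ultralytics applies its own preprocessing internally
pt_out = pt_model(frame, imgsz=640, verbose=False)[0]

# Mirror the web app: NHWC float32 in [0, 1]
x = tf.constant(frame[None].astype(np.float32) / 255.0)
tf_out = tf_model.signatures["serving_default"](images=x)

print(pt_out.boxes)
print({k: v.shape for k, v in tf_out.items()})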

My hand pose detection web app:

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Real-time Hand Pose Detection</title>
    <script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs"></script>
    <style>
        body { 
            text-align: center; 
            font-family: Arial, sans-serif;
            margin: 0;
            padding: 20px;
            background: #f0f0f0;
        }
        .container {
            position: relative;
            width: 640px;
            height: 480px;
            margin: 20px auto;
        }
        video, canvas { 
            position: absolute;
            left: 0;
            top: 0;
        }
        button {
            margin: 10px;
            padding: 10px 20px;
            font-size: 16px;
            cursor: pointer;
            background: #007bff;
            color: white;
            border: none;
            border-radius: 4px;
        }
        button:hover {
            background: #0056b3;
        }
        #status {
            padding: 10px;
            background: #fff;
            border-radius: 4px;
            display: inline-block;
        }
    </style>
</head>
<body>
    <h1>Real-time Hand Pose Detection (YOLOv8)</h1>
    <button onclick="loadModel()">Load Model</button>
    <button onclick="startWebcam()">Start Webcam</button>
    <p id="status">Model not loaded</p>

    <div class="container">
        <video id="video" width="640" height="480" autoplay></video>
        <canvas id="canvas" width="640" height="480"></canvas>
    </div>

    <script type="module">
        // Kalman Filter Implementation
        class KalmanFilter {
            constructor(stateSize, measurementSize, processNoise = 0.001, measurementNoise = 0.1) {
                this.state = new Array(stateSize).fill(0);         // State vector, e.g. [x, y, vx, vy]
                this.covariance = new Array(stateSize * stateSize).fill(0);
                this.processNoise = processNoise;
                this.measurementNoise = measurementNoise;
                this.stateSize = stateSize;
                this.measurementSize = measurementSize;

                // Initialize covariance matrix with high uncertainty
                for (let i = 0; i < stateSize; i++) {
                    this.covariance[i * stateSize + i] = 1000;
                }
            }

            predict(dt = 1/30) {
                // State transition matrix F for a constant-velocity model (position += velocity * dt)
                const F = new Array(this.stateSize * this.stateSize).fill(0);
                for (let i = 0; i < this.stateSize/2; i++) {
                    F[i * this.stateSize + i] = 1;
                    F[i * this.stateSize + (i + this.stateSize/2)] = dt;
                    F[(i + this.stateSize/2) * this.stateSize + (i + this.stateSize/2)] = 1;
                }

                // Predict state
                const newState = new Array(this.stateSize).fill(0);
                for (let i = 0; i < this.stateSize; i++) {
                    for (let j = 0; j < this.stateSize; j++) {
                        newState[i] += F[i * this.stateSize + j] * this.state[j];
                    }
                }
                this.state = newState;

                // Predict covariance
                const newCovariance = new Array(this.stateSize * this.stateSize).fill(0);
                for (let i = 0; i < this.stateSize; i++) {
                    for (let j = 0; j < this.stateSize; j++) {
                        for (let k = 0; k < this.stateSize; k++) {
                            newCovariance[i * this.stateSize + j] += 
                                F[i * this.stateSize + k] * this.covariance[k * this.stateSize + j];
                        }
                    }
                }

                // Add process noise
                for (let i = 0; i < this.stateSize; i++) {
                    newCovariance[i * this.stateSize + i] += this.processNoise;
                }

                this.covariance = newCovariance;
            }

            update(measurement) {
                // Measurement matrix
                const H = new Array(this.measurementSize * this.stateSize).fill(0);
                for (let i = 0; i < this.measurementSize; i++) {
                    H[i * this.stateSize + i] = 1;
                }

                // Calculate Kalman gain
                const S = new Array(this.measurementSize * this.measurementSize).fill(0);
                for (let i = 0; i < this.measurementSize; i++) {
                    for (let j = 0; j < this.measurementSize; j++) {
                        for (let k = 0; k < this.stateSize; k++) {
                            S[i * this.measurementSize + j] += 
                                H[i * this.stateSize + k] * this.covariance[k * this.stateSize + j];
                        }
                    }
                    S[i * this.measurementSize + i] += this.measurementNoise;
                }

                const K = new Array(this.stateSize * this.measurementSize).fill(0);
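                // K = P * H^T * S^(-1); S^(-1) is approximated below by dividing by
                // the diagonal of S, which is exact only when S is diagonal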
                for (let i = 0; i < this.stateSize; i++) {
                    for (let j = 0; j < this.measurementSize; j++) {
                        for (let k = 0; k < this.stateSize; k++) {
                            K[i * this.measurementSize + j] += 
                                this.covariance[i * this.stateSize + k] * H[j * this.stateSize + k];
                        }
                        K[i * this.measurementSize + j] /= S[j * this.measurementSize + j];
                    }
                }

                // Update state
                const innovation = new Array(this.measurementSize).fill(0);
                for (let i = 0; i < this.measurementSize; i++) {
                    innovation[i] = measurement[i];
                    for (let j = 0; j < this.stateSize; j++) {
                        innovation[i] -= H[i * this.stateSize + j] * this.state[j];
                    }
                }

                for (let i = 0; i < this.stateSize; i++) {
                    for (let j = 0; j < this.measurementSize; j++) {
                        this.state[i] += K[i * this.measurementSize + j] * innovation[j];
                    }
                }

                // Update covariance: P = (I - K*H) * P; with H = [I 0],
                // (K*H*P)[i][j] = sum_k K[i][k] * P[k][j] over the measured components
                const newCovariance = new Array(this.stateSize * this.stateSize).fill(0);
                for (let i = 0; i < this.stateSize; i++) {
                    for (let j = 0; j < this.stateSize; j++) {
                        newCovariance[i * this.stateSize + j] = this.covariance[i * this.stateSize + j];
                        for (let k = 0; k < this.measurementSize; k++) {
                            newCovariance[i * this.stateSize + j] -=
                                K[i * this.measurementSize + k] * this.covariance[k * this.stateSize + j];
                        }
                    }
                }
                this.covariance = newCovariance;
            }

            getState() {
                return this.state.slice(0, this.measurementSize);
            }
        }

        let model;
        let video = document.getElementById("video");
        let canvas = document.getElementById("canvas");
        let ctx = canvas.getContext("2d");

        const CONF_THRESHOLD = 0.75;
        const IOU_THRESHOLD = 0.1;
        let isProcessing = false;
        let previousDetections = [];

        // Initialize Kalman filters
        let bboxFilter = new KalmanFilter(8, 4, 0.005, 0.2); // State: [x, y, w, h, vx, vy, vw, vh]
        let keypointFilter = new KalmanFilter(4, 2, 0.005, 0.2); // State: [x, y, vx, vy]
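        // NOTE: a single shared filter pair implicitly assumes one hand in frame;
        // with multiple detections these filters mix measurements across hands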
        let lastFrameTime = performance.now();

        // Model input size constants
        const MODEL_WIDTH = 640;
        const MODEL_HEIGHT = 640;
        const SCALE_FACTOR = 1.8;
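        // Empirical multiplier applied to predicted box width/height in
        // processDetections (hand-tuned; not part of the model output)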

        async function loadModel() {
            try {
                document.getElementById("status").innerText = "Loading model...";
                model = await tf.loadGraphModel('http://localhost:8000/model.json');
                document.getElementById("status").innerText = "Model loaded!";
                console.log("Model loaded successfully");
            } catch (error) {
                console.error("Error loading model:", error);
                document.getElementById("status").innerText = "Error loading model!";
            }
        }

        async function startWebcam() {
            if (!model) {
                alert("Please load the model first!");
                return;
            }

            try {
                const stream = await navigator.mediaDevices.getUserMedia({ 
                    video: { 
                        width: { ideal: 640 },
                        height: { ideal: 480 },
                        facingMode: 'user'
                    } 
                });
                video.srcObject = stream;
                video.onloadedmetadata = () => {
                    video.play();
                    processVideoFrame();
                };
            } catch (err) {
                console.error("Error accessing webcam:", err);
                document.getElementById("status").innerText = "Error accessing webcam!";
            }
        }

        async function processVideoFrame() {
            if (!model || !video.videoWidth || isProcessing) return;
            
            try {
                isProcessing = true;
                
                const offscreenCanvas = document.createElement('canvas');
                offscreenCanvas.width = MODEL_WIDTH;
                offscreenCanvas.height = MODEL_HEIGHT;
                const offscreenCtx = offscreenCanvas.getContext('2d');
                
                const scale = Math.min(MODEL_WIDTH / video.videoWidth, MODEL_HEIGHT / video.videoHeight);
                const scaledWidth = video.videoWidth * scale;
                const scaledHeight = video.videoHeight * scale;
                const offsetX = (MODEL_WIDTH - scaledWidth) / 2;
                const offsetY = (MODEL_HEIGHT - scaledHeight) / 2;
                
                offscreenCtx.fillStyle = 'black';
                offscreenCtx.fillRect(0, 0, MODEL_WIDTH, MODEL_HEIGHT);
                offscreenCtx.drawImage(video, offsetX, offsetY, scaledWidth, scaledHeight);
                
                const imgTensor = tf.tidy(() => {
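                    // Mirror the Python-side preprocessing: fromPixels yields RGB,
                    // expandDims adds the batch dim (NHWC), values scaled to [0, 1]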
                    return tf.browser.fromPixels(offscreenCanvas)
                        .expandDims(0)
                        .toFloat()
                        .div(255.0);
                });
        
                const predictions = await model.predict(imgTensor);
                imgTensor.dispose();
                
                const processedDetections = await processDetections(predictions, {
                    offsetX,
                    offsetY,
                    scale,
                    originalWidth: video.videoWidth,
                    originalHeight: video.videoHeight
                });
                
                const smoothedDetections = smoothDetections(processedDetections);
                drawDetections(smoothedDetections);
                
                previousDetections = smoothedDetections;
                
                if (Array.isArray(predictions)) {
                    predictions.forEach(p => p.dispose());
                } else {
                    predictions.dispose();
                }
                
            } catch (error) {
                console.error("Error in processing frame:", error);
            } finally {
                isProcessing = false;
                requestAnimationFrame(processVideoFrame);
            }
        }

        async function processDetections(predictionTensor, transformInfo) {
            const predictions = await predictionTensor.array();
            
            if (!predictions.length || !predictions[0].length) {
                return [];
            }
            
            let detections = [];
            const numDetections = predictions[0][0].length;
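            // Assumed output layout [1, C, N]: rows 0-3 = cx, cy, w, h in model-space
            // pixels, row 4 = confidence, rows 5-6 = keypoint x, y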
            
            for (let i = 0; i < numDetections; i++) {
                const confidence = predictions[0][4][i];
                
                if (confidence > CONF_THRESHOLD) {
                    let x = (predictions[0][0][i] - transformInfo.offsetX) / transformInfo.scale;
                    let y = (predictions[0][1][i] - transformInfo.offsetY) / transformInfo.scale;
                    let width = (predictions[0][2][i] / transformInfo.scale) * SCALE_FACTOR;
                    let height = (predictions[0][3][i] / transformInfo.scale) * SCALE_FACTOR;
                    
                    let kp_x = (predictions[0][5][i] - transformInfo.offsetX) / transformInfo.scale;
                    let kp_y = (predictions[0][6][i] - transformInfo.offsetY) / transformInfo.scale;
                    
                    x = x / transformInfo.originalWidth;
                    y = y / transformInfo.originalHeight;
                    width = width / transformInfo.originalWidth;
                    height = height / transformInfo.originalHeight;
                    kp_x = kp_x / transformInfo.originalWidth;
                    kp_y = kp_y / transformInfo.originalHeight;
                    
                    x = Math.max(0, Math.min(1, x));
                    y = Math.max(0, Math.min(1, y));
                    kp_x = Math.max(0, Math.min(1, kp_x));
                    kp_y = Math.max(0, Math.min(1, kp_y));
                    
                    detections.push({
                        bbox: [x, y, width, height],
                        confidence,
                        keypoint: [kp_x, kp_y]
                    });
                }
            }
            
            return applyNMS(detections);
        }

        function smoothDetections(currentDetections) {
            const currentTime = performance.now();
            const dt = (currentTime - lastFrameTime) / 1000; // Convert to seconds
            lastFrameTime = currentTime;

            return currentDetections.map(detection => {
                // Predict next state
                bboxFilter.predict(dt);
                keypointFilter.predict(dt);

                // Update with new measurements
                const [x, y, width, height] = detection.bbox;
                bboxFilter.update([x, y, width, height]);

                const [kpX, kpY] = detection.keypoint;
                keypointFilter.update([kpX, kpY]);

                // Get filtered states
                const filteredBbox = bboxFilter.getState();
                const filteredKeypoint = keypointFilter.getState();

                return {
                    bbox: filteredBbox,
                    confidence: detection.confidence,
                    keypoint: filteredKeypoint
                };
            });
        }

        function calculateIoU(box1, box2) {
            const [x1, y1, w1, h1] = box1;
            const [x2, y2, w2, h2] = box2;
            
            const x1min = x1 - w1/2;
            const x1max = x1 + w1/2;
            const y1min = y1 - h1/2;
            const y1max = y1 + h1/2;
            
            const x2min = x2 - w2/2;
            const x2max = x2 + w2/2;
            const y2min = y2 - h2/2;
            const y2max = y2 + h2/2;
            
            const xOverlap = Math.max(0, Math.min(x1max, x2max) - Math.max(x1min, x2min));
            const yOverlap = Math.max(0, Math.min(y1max, y2max) - Math.max(y1min, y2min));
            
            const intersectionArea = xOverlap * yOverlap;
            const union = w1 * h1 + w2 * h2 - intersectionArea;
            
            return intersectionArea / union;
        }

        function applyNMS(detections) {
            detections.sort((a, b) => b.confidence - a.confidence);
            
            const selected = [];
            const active = new Set(Array(detections.length).keys());
            
            for (let i = 0; i < detections.length; i++) {
                if (!active.has(i)) continue;
                
                selected.push(detections[i]);
                
                for (let j = i + 1; j < detections.length; j++) {
                    if (!active.has(j)) continue;
                    
                    const iou = calculateIoU(detections[i].bbox, detections[j].bbox);
                    if (iou >= IOU_THRESHOLD) active.delete(j);
                }
            }
            
            return selected;
        }

        function drawDetections(detections) {
            ctx.clearRect(0, 0, canvas.width, canvas.height);
            ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
            
            detections.forEach(detection => {
                const [x, y, width, height] = detection.bbox;
                const [keypointX, keypointY] = detection.keypoint;
                
                // Convert normalized coordinates to pixel values
                const boxX = (x - width/2) * canvas.width;
                const boxY = (y - height/2) * canvas.height;
                const boxWidth = width * canvas.width;
                const boxHeight = height * canvas.height;
                
                // Draw bounding box
                ctx.strokeStyle = 'red';
                ctx.lineWidth = 2;
                ctx.strokeRect(boxX, boxY, boxWidth, boxHeight);
                
                // Draw keypoint
                const kpX = keypointX * canvas.width;
                const kpY = keypointY * canvas.height;
                
                ctx.fillStyle = 'blue';
                ctx.beginPath();
                ctx.arc(kpX, kpY, 5, 0, 2 * Math.PI);
                ctx.fill();
                
                // Draw confidence score
                ctx.fillStyle = 'red';
                ctx.font = '14px Arial';
                ctx.fillText(`Conf: ${detection.confidence.toFixed(2)}`, boxX, boxY - 5);
            });
        }

        window.loadModel = loadModel;
        window.startWebcam = startWebcam;
    </script>
</body>
</html>

Things I have tried: adjusting the bounding-box scaling (the SCALE_FACTOR above) and tuning the IoU and confidence thresholds; neither resolved the close-range failures.