I have an application that uses the Gemini Live API. A web browser connects to a Go server for a live Speech-to-Speech stream.
As of yesterday, 2025/09/09, the Gemini Live API has stopped understanding the audio input for both the gemini-live-2.5-flash-preview and gemini-2.5-flash-preview-native-audio-dialog models.
The application worked reliably from July 6th through September 8th, and no changes have been made to it. But since yesterday Gemini no longer understands the audio it receives as input. At first I thought my microphone was broken, but I have checked that my PC microphone is working.
In my code the audio is saved to a file before being sent to Gemini, and I have verified that the locally saved audio is fine. The audio written to that file is extracted from the same payload that is sent to Gemini in the very next statement.
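For completeness, this is roughly the kind of dump I use for that check, sketched here as a plain WAV instead of the MP3 my real code produces (the writeWAV helper and the 16 kHz / 16-bit / mono parameters are assumptions that simply mirror the browser capture settings shown further down):

import (
	"bytes"
	"encoding/binary"
	"os"
)

// writeWAV wraps raw 16-bit little-endian PCM in a minimal WAV header
// so the captured audio can be played back locally for verification.
func writeWAV(path string, pcm []byte) error {
	const (
		sampleRate    = 16000 // matches sendSampleRate in main.js
		bitsPerSample = 16
		numChannels   = 1
	)
	byteRate := sampleRate * numChannels * bitsPerSample / 8
	blockAlign := numChannels * bitsPerSample / 8

	var buf bytes.Buffer
	// RIFF header
	buf.WriteString("RIFF")
	binary.Write(&buf, binary.LittleEndian, uint32(36+len(pcm)))
	buf.WriteString("WAVE")
	// fmt chunk (uncompressed PCM)
	buf.WriteString("fmt ")
	binary.Write(&buf, binary.LittleEndian, uint32(16))
	binary.Write(&buf, binary.LittleEndian, uint16(1))
	binary.Write(&buf, binary.LittleEndian, uint16(numChannels))
	binary.Write(&buf, binary.LittleEndian, uint32(sampleRate))
	binary.Write(&buf, binary.LittleEndian, uint32(byteRate))
	binary.Write(&buf, binary.LittleEndian, uint16(blockAlign))
	binary.Write(&buf, binary.LittleEndian, uint16(bitsPerSample))
	// data chunk
	buf.WriteString("data")
	binary.Write(&buf, binary.LittleEndian, uint32(len(pcm)))
	buf.Write(pcm)

	return os.WriteFile(path, buf.Bytes(), 0o644)
}

It can be called once the read loop below ends, e.g. writeWAV("user-input.wav", []byte(builderAudioUser.String())).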
In the input transcription I see some text like the following: <noise> Sí. زيت
or <noise> ولما ซื้อ എന്ന് e o
The audio generated by Gemini and its output transcription are ok.
For the server code I use the Google Gen AI Go SDK, updated to the latest commit. The Go code is the same as the example in the Google Gen AI Go SDK GitHub repository.
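For context, the session setup follows that example and looks approximately like the sketch below. This is illustrative only: the exact Live.Connect signature and LiveConnectConfig fields may differ between SDK commits, so the repository example is the authoritative reference, not this snippet.

package main

import (
	"context"
	"log"

	"google.golang.org/genai"
)

func main() {
	ctx := context.Background()

	// Client setup; the API key is picked up from the environment here.
	client, err := genai.NewClient(ctx, &genai.ClientConfig{Backend: genai.BackendGeminiAPI})
	if err != nil {
		log.Fatal(err)
	}

	// Open the Live session for the native-audio model.
	// NOTE: the call shape and config fields are an assumption based on the SDK example;
	// check the repository for the exact form on the current commit.
	model := "gemini-2.5-flash-preview-native-audio-dialog"
	session, err := client.Live.Connect(ctx, model, &genai.LiveConnectConfig{
		ResponseModalities: []genai.Modality{genai.ModalityAudio},
	})
	if err != nil {
		log.Fatal(err)
	}
	defer session.Close()
}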
Has anything changed on the Gemini side for the Live API? As mentioned above, it worked perfectly from July 6th until the day before yesterday.
This is the snippet of Go code used to send audio to Gemini:
// builderAudioUser buffers the raw input audio, which is later saved to an MP3 file
var builderAudioUser strings.Builder
// loop for audio input
for {
	_, message, err := c.ReadMessage()
	if err != nil {
		if websocket.IsCloseError(err, websocket.CloseNoStatusReceived) || websocket.IsCloseError(err, websocket.CloseNormalClosure) {
			logger.Warn(fmt.Sprintf("session %q: from %s - websocket connection closed: %s", sessionID, remote, err.Error()))
		} else {
			logger.Error(fmt.Sprintf("session %q: from %s - read from client error: %s", sessionID, remote, err.Error()))
		}
		break // stop the loop once the client connection is gone
	}
	var realtimeInput genai.LiveRealtimeInput
	if err := json.Unmarshal(message, &realtimeInput); err != nil {
		logger.Error(fmt.Sprintf("session %q: from %s - unmarshal message error: %s %s", sessionID, remote, string(message), err.Error()))
		continue // skip malformed messages instead of forwarding them
	}
	// buffer the user audio chunk locally
	builderAudioUser.Write(realtimeInput.Media.Data)
	// forward the chunk to the model
	session.SendRealtimeInput(realtimeInput)
}
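For reference, the JSON frame produced by createAudioContent() in the main.js code below unmarshals directly into genai.LiveRealtimeInput. A small standalone check (the base64 in "data" is just a placeholder, and the MIMEType field name is assumed from the SDK's Blob type):

package main

import (
	"encoding/json"
	"fmt"

	"google.golang.org/genai"
)

func main() {
	// Example of the frame sent by the browser; "AAAA" is placeholder base64, not real audio.
	raw := []byte(`{"media":{"data":"AAAA","mimeType":"audio/pcm"}}`)

	var in genai.LiveRealtimeInput
	if err := json.Unmarshal(raw, &in); err != nil {
		panic(err)
	}
	// encoding/json decodes the base64 "data" value into raw PCM bytes.
	fmt.Println(len(in.Media.Data), in.Media.MIMEType) // expected: 3 audio/pcm (assuming the Blob JSON tags match)
}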
This is the browser main.js JavaScript code:
(function () {
// base network config
const protoHTTP = 'https://'
const hostAddress = 'terme.gimlab.dev:8443';
//const hostAddress = 'ff2546f72879.ngrok-free.app';
//const livePath = '/live-web-tp'; // gemini live
const livePath = '/live-native-audio-web-tp'; // gemini native audio
// audio worklet endpoint
const audioWorkletPath = '/audio-processor.js';
const audioWorkletAddress = `${protoHTTP}${hostAddress}${audioWorkletPath}`;
// config WebSocket
const protoWS = "wss://"; // secure
// websocket endpoint
let websocketURL = `${protoWS}${hostAddress}${livePath}`;
let ws;
// send and receive sample rates
let sendSampleRate = 16000;
let receiveSampleRate = 24000;
let isAudioPlaying = false; // for audio playback
let isRecording = false; // For audio recording state
// audio resources
let mediaStream = null;
let audioContextRecord = null;
let audioContextPlayback = null;
let masterGainNode = null;
let audioQueue = [];
let playbackStartTime = 0; // Tracks the start time for the next audio chunk
let chatButton;
let isChatSessionActive = false;
// CSS Class names for button and tooltips states
const START_BUTTON_CLASS = 'chat-button-start';
const STOP_BUTTON_CLASS = 'chat-button-stop';
const tooltipText = document.querySelector('#tooltip-text');
const tooltipSTART = 'Parla con un Assistente';
const tooltipEND = 'Fine';
// disable screensaver function
let wakeLock = null;
console.log(wakeLock);
async function noScreenSaver() {
try {
// request a screen wake lock
wakeLock = await navigator.wakeLock.request("screen");
console.log("Wake Lock is active!");
// listen for release screen wake lock
wakeLock.addEventListener('release', () => {
console.log('Screen Wake Lock released:', wakeLock.released);
});
} catch (err) {
// the Wake Lock request has failed - usually system related, such as battery
console.log(`${err.name}, ${err.message}`);
}
}
function createAudioContent(msg) {
const data = { 'media': { 'data': msg, 'mimeType': 'audio/pcm' } };
return JSON.stringify(data);
}
function b64ToUint8Array(b64Data) {
const byteCharacters = atob(b64Data);
const byteNumbers = new Uint8Array(byteCharacters.length);
for (let i = 0; i < byteCharacters.length; i++) {
byteNumbers[i] = byteCharacters.charCodeAt(i);
}
return byteNumbers;
}
// Get the chat toggle button
chatButton = document.querySelector('#chat-toggle-button');
// Set initial button style using CSS class
chatButton.classList.add(START_BUTTON_CLASS);
// On Click: Start or stop the session and immediately hide the tooltip.
chatButton.onclick = function () {
// add a class to chatButton to temporarily disable hover effects
chatButton.classList.add('no-hover');
// Hide tooltip immediately on any click
tooltipText.classList.remove('tooltip-visible');
if (!isChatSessionActive) {
// START CHAT
if (!ws || ws.readyState === WebSocket.CLOSED) {
noScreenSaver();
openWs();
}
} else {
// STOP CHAT
if (wakeLock !== null) {
wakeLock.release().then(() => {
wakeLock = null;
});
}
stopSession();
}
};
// On Mouse Enter: Show the correct tooltip based on the call state.
chatButton.onmouseenter = function() {
if (!isChatSessionActive) {
tooltipText.innerHTML = `<strong>${tooltipSTART}</strong>`;
} else {
tooltipText.innerHTML = `<strong>${tooltipEND}</strong>`;
}
tooltipText.classList.add('tooltip-visible');
};
// On Mouse Leave: Always hide the tooltip.
chatButton.onmouseleave = function() {
// remove the no-hover class from chatButton so hover effects work again
chatButton.classList.remove('no-hover');
tooltipText.classList.remove('tooltip-visible');
};
// stop the session cleanly
function stopSession() {
console.log("User requested to stop the session.");
// Close WebSocket if it's open or connecting
if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) {
ws.close();
}
// This will also trigger ws.onclose, which calls stopAllAudioAndUI()
// But we can call it directly to ensure UI updates immediately
stopAllAudioAndUI();
}
// central function to stop all audio and reset UI
function stopAllAudioAndUI() {
console.log("Stopping all audio resources and resetting UI.");
isRecording = false;
isChatSessionActive = false;
// stop microphone stream
if (mediaStream) {
mediaStream.getTracks().forEach(track => track.stop());
mediaStream = null;
console.log("Microphone stream stopped.");
}
// close recording audio context
if (audioContextRecord && audioContextRecord.state !== 'closed') {
audioContextRecord.close().then(() => {
audioContextRecord = null;
console.log("Recording AudioContext closed.");
});
}
// stop and clear playback
stopAndClearAudio(); // This handles the playback queue
if (audioContextPlayback && audioContextPlayback.state !== 'closed') {
audioContextPlayback.close().then(() => {
audioContextPlayback = null;
console.log("Playback AudioContext closed.");
});
}
// reset UI
if (chatButton) {
chatButton.classList.remove(STOP_BUTTON_CLASS);
chatButton.classList.add(START_BUTTON_CLASS);
}
}
function openWs() {
if (ws && ws.readyState !== WebSocket.CLOSED) {
console.log("WebSocket connection already active or attempting to connect.");
return false;
}
ws = new WebSocket(websocketURL);
ws.onopen = function () {
console.log('OPEN: WebSocket Connection Established.');
initAudioContextPlayback(); // init the PCM audio player
recordStart();
if (chatButton) {
chatButton.classList.remove(START_BUTTON_CLASS);
chatButton.classList.add(STOP_BUTTON_CLASS);
}
isChatSessionActive = true;
};
ws.onclose = function () {
console.log('CLOSE: WebSocket Connection Closed.');
// ensure all resources are released on any close event
if (wakeLock !== null) {
wakeLock.release().then(() => {
wakeLock = null;
});
}
stopAllAudioAndUI();
ws = null;
};
ws.onmessage = function (evt) {
let data;
try {
data = JSON.parse(evt.data);
} catch (e) {
console.error("Error parsing message data: " + e);
return;
}
if (!data.serverContent) return;
// --- manage interruptions --- //
if (data.serverContent.interrupted) {
console.log("INTERRUPTED SIGNAL RECEIVED");
stopAndClearAudio();
return;
}
// stream playback
if (!data.serverContent.modelTurn || !data.serverContent.modelTurn.parts || !data.serverContent.modelTurn.parts[0]) return;
if (data.serverContent.modelTurn.parts[0].inlineData) {
const inlineData = data.serverContent.modelTurn.parts[0].inlineData;
if (inlineData.mimeType && inlineData.mimeType.startsWith('audio/pcm')) {
const audioData = b64ToUint8Array(inlineData.data);
// Push the raw ArrayBuffer to the queue
audioQueue.push(audioData.buffer);
// If playback is not already running, start it
if (!isAudioPlaying) {
playNextChunk();
}
}
}
};
ws.onerror = function (evt) {
console.error('WebSocket Error:', evt);
// clean up on error
stopAllAudioAndUI();
};
}
function recordStart() {
if (isRecording) return;
isRecording = true; // Set flag immediately
recordAudio();
}
async function recordAudio() {
try {
// assign stream to the higher-scoped variable
mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true });
console.log('Microphone access granted. Initializing AudioWorklet...');
// assign context to the higher-scoped variable
audioContextRecord = new AudioContext({ sampleRate: sendSampleRate });
// Load your custom processor
//await audioContextRecord.audioWorklet.addModule('/scripts/audio-processor.js');
await audioContextRecord.audioWorklet.addModule(audioWorkletAddress);
// Create a source node from the microphone stream
const source = audioContextRecord.createMediaStreamSource(mediaStream);
// Create an instance of your AudioWorkletNode
const workletNode = new AudioWorkletNode(audioContextRecord, 'audio-processor');
// Handle messages (audio data) received from the worklet
workletNode.port.onmessage = (event) => {
if (!isRecording) return;
const pcm16Buffer = event.data; // This is an ArrayBuffer
if (ws && ws.readyState === WebSocket.OPEN) {
const base64Data = arrayBufferToBase64(pcm16Buffer);
ws.send(createAudioContent(base64Data));
}
};
// Connect the graph: microphone -> worklet
source.connect(workletNode);
console.log('AudioWorklet is running. Recording started...');
} catch (err) {
console.error('Error accessing microphone or setting up AudioWorklet: ' + err.message);
// use the central cleanup function on error
stopAllAudioAndUI();
}
}
function arrayBufferToBase64(buffer) {
let binary = '';
const bytes = new Uint8Array(buffer);
const len = bytes.byteLength;
for (let i = 0; i < len; i++) {
binary += String.fromCharCode(bytes[i]);
}
return btoa(binary);
}
function initAudioContextPlayback() {
// If the context doesn't exist or has been closed, create a new one
if (!audioContextPlayback || audioContextPlayback.state === 'closed') {
audioContextPlayback = new AudioContext({ sampleRate: receiveSampleRate });
// Create the master gain node which will act as our gate
masterGainNode = audioContextPlayback.createGain();
// Connect the gate to the final output (speakers)
masterGainNode.connect(audioContextPlayback.destination);
// Set the initial playback time to the current time of the new context
playbackStartTime = audioContextPlayback.currentTime;
}
}
function playNextChunk() {
if (audioQueue.length === 0 || !isChatSessionActive) {
isAudioPlaying = false;
return; // Stop if the queue is empty
}
isAudioPlaying = true;
// Get the next raw audio buffer from the queue
const pcmDataBuffer = audioQueue.shift();
const pcm16Data = new Int16Array(pcmDataBuffer);
// Convert the 16-bit PCM data to 32-bit Float data (which Web Audio API requires)
const pcm32fData = new Float32Array(pcm16Data.length);
for (let i = 0; i < pcm16Data.length; i++) {
pcm32fData[i] = pcm16Data[i] / 32768.0; // Normalize to range [-1.0, 1.0]
}
// Create a Web Audio Buffer
const audioBuffer = audioContextPlayback.createBuffer(
1, // Number of channels
pcm32fData.length, // Buffer length
receiveSampleRate // Sample rate
);
// Copy our data into the audio buffer
audioBuffer.copyToChannel(pcm32fData, 0);
// Create a source node to play the buffer
const sourceNode = audioContextPlayback.createBufferSource();
sourceNode.buffer = audioBuffer;
sourceNode.connect(masterGainNode);
// Schedule the playback
// If the scheduled start time is in the past, start immediately
const now = audioContextPlayback.currentTime;
if (playbackStartTime < now) {
playbackStartTime = now;
}
sourceNode.start(playbackStartTime);
// Update the start time for the *next* chunk to be the end time of *this* chunk
playbackStartTime += audioBuffer.duration;
// When this chunk finishes playing, automatically play the next one in the queue
sourceNode.onended = playNextChunk;
}
function stopAndClearAudio() {
console.log("INTERRUPTED: Flushing audio playback queue.");
// Ensure the context and gain node exist before proceeding
if (!audioContextPlayback || !masterGainNode) return;
// mute everything instantly by disconnecting the old gate
// all playing and scheduled sounds are now routed to a dead end
masterGainNode.disconnect();
// create a new, clean gain node for future audio
masterGainNode = audioContextPlayback.createGain();
masterGainNode.connect(audioContextPlayback.destination);
// clear the application's queue of pending audio chunks
audioQueue = [];
// reset the playback state flag
isAudioPlaying = false;
// reset the playback start time to now. This ensures the next
// audio that arrives after the interruption plays without a delay
playbackStartTime = audioContextPlayback.currentTime;
}
})();
This is the browser JavaScript code for the audio worklet:
// audio-processor.js
/**
* Converts Float32 audio data to Int16 PCM.
* @param {Float32Array} float32Array The input audio data.
* @returns {Int16Array} The converted 16-bit PCM data.
*/
function convertFloat32ToInt16(float32Array) {
const int16Array = new Int16Array(float32Array.length);
for (let i = 0; i < float32Array.length; i++) {
// Clamp the value between -1 and 1 before converting
const val = Math.max(-1, Math.min(1, float32Array[i]));
// Scale to 16-bit integer range
int16Array[i] = val * 32767;
}
return int16Array;
}
class AudioProcessor extends AudioWorkletProcessor {
// The process method is called for every block of audio data.
// `inputs` is an array of inputs, each with an array of channels.
// We assume a single input and a single channel.
process(inputs, outputs, parameters) {
const input = inputs[0];
if (input.length > 0) {
const inputChannel = input[0]; // Float32Array
// Convert the audio data to 16-bit PCM.
const pcm16Data = convertFloat32ToInt16(inputChannel);
// Post the data back to the main thread.
// We transfer the buffer to avoid copying, which is more efficient.
this.port.postMessage(pcm16Data.buffer, [pcm16Data.buffer]);
}
// Return true to keep the processor alive.
return true;
}
}
registerProcessor('audio-processor', AudioProcessor);