Hi Sonali_Kumari1! Thanks for the follow-up and the warm welcome, I really appreciate it.
Below is a minimal reproducible example.
This is not my production code, but a simplified version that isolates the issue.
Even with just this basic configuration in a clean script, the input_audio_transcription events start arriving with a noticeable delay (or sometimes stop arriving altogether) during long continuous user speech (≈20–40 seconds).
The same delay behavior also occurs with the official Google Workshop example, with no custom logic at all, which suggests it may not be related to my application code.
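To be concrete about what I mean by "delay": while reproducing this I log a monotonic timestamp next to each audio chunk I send and each transcription event I receive, roughly like the helper below (log_event is my own logging code, not part of the google-genai SDK). In the full example further down I keep plain print() calls to avoid extra moving parts.

import time

_START = time.monotonic()

def log_event(label: str, text: str = "") -> None:
    # Print an event together with seconds elapsed since the script started
    print(f"[{time.monotonic() - _START:7.2f}s] {label} {text}", flush=True)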
Here is the full minimal example:
import asyncio

from google import genai
from google.genai import types
# ==== Constants used in my real project (inlined here for the minimal example) ====
GOOGLE_GEMINI_SILENCE_THRESHOLD_MS = 900 # Silence threshold in ms for Gemini
GOOGLE_GEMINI_PREFIX_PADDING_MS = 300 # Prefix padding in ms for Gemini
GOOGLE_GEMINI_SESSION_RESUMPTION_TIMEOUT = 6600 # How long resumption tokens stay valid (not used in this minimal script)
GEMINI_CUSTOM_WORD_REPLACEMENTS_ENABLED = True # Enable custom word replacements for Gemini (not used in this minimal script)
GOOGLE_GEMINI_USE_REPORT_SIGNAL_TOOL = True # Use ReportSignal tool for better speech detection (not used in this minimal script)
GOOGLE_GEMINI_CONTEXT_WINDOW_TRIGGER_TOKENS = 25600 # Trigger tokens for context window compression
GOOGLE_GEMINI_CONTEXT_WINDOW_TARGET_TOKENS = 12800 # Target tokens for sliding window after compression
# ==== Live config used to reproduce the issue ====
live_config = types.LiveConnectConfigDict(
    response_modalities=[types.Modality.AUDIO],
    speech_config=types.SpeechConfigDict(
        voice_config=types.VoiceConfigDict(
            prebuilt_voice_config={
                "voice_name": "Aoede",
            }
        ),
        language_code="en-US",
    ),
    # I keep these dicts empty as in my real code – the delay / missing
    # input_audio_transcription events still happen in this configuration.
    input_audio_transcription={},
    output_audio_transcription={},
    thinking_config=types.ThinkingConfigDict(
        include_thoughts=False,
        thinking_budget=0,
    ),
    realtime_input_config=types.RealtimeInputConfigDict(
        automatic_activity_detection=types.AutomaticActivityDetectionDict(
            disabled=False,
            start_of_speech_sensitivity=types.StartSensitivity.START_SENSITIVITY_HIGH,
            end_of_speech_sensitivity=types.EndSensitivity.END_SENSITIVITY_LOW,
            silence_duration_ms=GOOGLE_GEMINI_SILENCE_THRESHOLD_MS,
            prefix_padding_ms=GOOGLE_GEMINI_PREFIX_PADDING_MS,
        )
    ),
    # In my real app I use session resumption; for this minimal repro I keep handle=None
    session_resumption=types.SessionResumptionConfigDict(
        handle=None,
    ),
    context_window_compression=types.ContextWindowCompressionConfigDict(
        trigger_tokens=GOOGLE_GEMINI_CONTEXT_WINDOW_TRIGGER_TOKENS,
        sliding_window=types.SlidingWindowDict(
            target_tokens=GOOGLE_GEMINI_CONTEXT_WINDOW_TARGET_TOKENS,
        ),
    ),
    system_instruction="You are a helpful assistant.",
    # In the real project I may use tools (e.g. word replacement, report signal tool),
    # but for this minimal example I leave them empty to better isolate the issue.
    tools=[
        {
            "function_declarations": []
        }
    ],
)
# ==== Minimal script to reproduce the transcription delay issue ====
async def main():
    client = genai.Client()  # API key is taken from the environment

    # Open a Live session with the config above
    async with client.aio.live.connect(
        model="gemini-2.5-flash-native-audio-preview-02-05",
        config=live_config,
    ) as session:

        async def receive_events():
            # Print every input transcription fragment as soon as it arrives
            while True:
                async for message in session.receive():
                    content = message.server_content
                    if content and content.input_transcription and content.input_transcription.text:
                        print("TRANSCRIPTION:", content.input_transcription.text, flush=True)

        receiver = asyncio.create_task(receive_events())

        # Simulate a long user utterance by streaming a raw audio file
        # (16 kHz, 16-bit mono PCM, e.g. 20–40 seconds of continuous speech)
        with open("long_speech_sample.raw", "rb") as f:
            while chunk := f.read(4000):
                await session.send_realtime_input(
                    audio=types.Blob(data=chunk, mime_type="audio/pcm;rate=16000")
                )
                # 4000 bytes ≈ 125 ms of audio, so stream roughly in real time
                await asyncio.sleep(0.125)

        # Give late transcription events some time to arrive, then shut down
        await asyncio.sleep(5)
        receiver.cancel()

if __name__ == "__main__":
    asyncio.run(main())
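For reference, long_speech_sample.raw is just raw 16 kHz, 16-bit mono PCM. This is roughly how I produce such a file from a WAV recording with the standard-library wave module (the file names are placeholders from my setup):

import wave

# Extract the raw PCM payload from a 16 kHz, 16-bit mono WAV recording
with wave.open("long_speech_sample.wav", "rb") as wav:
    assert wav.getframerate() == 16000 and wav.getnchannels() == 1 and wav.getsampwidth() == 2
    pcm = wav.readframes(wav.getnframes())

with open("long_speech_sample.raw", "wb") as raw:
    raw.write(pcm)

Any recording of roughly 20–40 seconds of continuous speech reproduces the behavior on my side.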