I'm trying to call Gemini 2.5 Pro to run inference over a video in chunks. I upload the video using the Files API, and I explicitly cache the system prompts and the tool definition.
I receive 500 Server Errors that look like this:
ServerError("500 INTERNAL. {'error': {'code': 500, 'message': 'Internal error encountered.', 'status': 'INTERNAL'}}")
Here’s how my code looks (sort of):
from google.genai.types import (
    CachedContent, CreateCachedContentConfig, File, FileData,
    FunctionCallingConfig, FunctionCallingConfigMode, Part,
    ToolConfig, VideoMetadata,
)

# GeminiRequester, BoundedTaskGroup, TaggedScene, ToolModelMirror, tool,
# get_chunks, get_system_prompt and get_metadata_prompt are our own code.

async def run_model(klass: GeminiRequester, file: File, start_time: int, end_time: int, cache_name: str):
    # One sliced view of the uploaded video: same file URI, different offsets.
    video_msg = Part(
        file_data=FileData(file_uri=file.uri, mime_type=file.mime_type),
        video_metadata=VideoMetadata(fps=1, start_offset=f'{start_time}s', end_offset=f'{end_time}s'),
    )
    klass.messages = [video_msg, Part.from_text(text=klass.get_prompt())]
    # run() parses the raw response into the tool model mirror behind the scenes.
    response: ToolModelMirror = await klass.run(cache_name=cache_name)
    return [
        TaggedScene.model_validate({**tag.model_dump(), 'name': character_tagging.name})
        for character_tagging in response.tagged_characters
        for tag in character_tagging.tagged_utterances
    ]

async def main(client, project, file: File, **kwargs):
    system: list[str] = [get_system_prompt(**kwargs), get_metadata_prompt(**kwargs)]
    # Only the system prompts and the tool definition are cached;
    # the video itself is not part of the cache in this version.
    cache: CachedContent = await client.aio.caches.create(
        model='gemini-2.5-pro',
        config=CreateCachedContentConfig(
            display_name=f'{project.id}-tagging',
            system_instruction=system,
            tools=[tool],
            tool_config=ToolConfig(
                function_calling_config=FunctionCallingConfig(
                    mode=FunctionCallingConfigMode.ANY,
                    allowed_function_names=[tool.function_declarations[0].name],
                )
            ),
            ttl='3000s',
        ),
    )
    chunks = get_chunks(project)
    requester = GeminiRequester(project)  # our own abstraction
    async with BoundedTaskGroup(max_parallelism=5) as tg:
        tasks = [
            tg.create_task(
                run_model(klass=requester, file=file, start_time=chunk[0], end_time=chunk[1], cache_name=cache.name)
            )
            for chunk in chunks
        ]
    # Each task returns a list of TaggedScene entries.
    results = []
    for t in tasks:
        results.extend(t.result())
    return results
I previously had a similar implementation, except that I cached the whole video, and instead of specifying start/end intervals, each message used the entire cached video as input. That version worked with exactly the same system prompts and tool definition as this one, so I suspect the problem is related to slicing the video with intervals.
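In case it helps narrow things down, here is roughly what a single chunk's request looks like once our GeminiRequester wrapper is stripped away (a sketch built directly on google-genai; the function name, prompt text, and offsets are placeholders):

from google.genai.types import FileData, GenerateContentConfig, Part, VideoMetadata

async def one_chunk(client, file, cache_name: str):
    # One sliced video part plus a text prompt, pointing at the cached
    # system prompts / tool definition via cached_content.
    video_part = Part(
        file_data=FileData(file_uri=file.uri, mime_type=file.mime_type),
        video_metadata=VideoMetadata(fps=1, start_offset='0s', end_offset='300s'),
    )
    return await client.aio.models.generate_content(
        model='gemini-2.5-pro',
        contents=[video_part, Part.from_text(text='Tag the characters in this interval.')],
        config=GenerateContentConfig(cached_content=cache_name),
    )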