I am wondering whether Gemini 2.0 Flash and 2.5 Flash have a fixed maximum number of output tokens when a video is uploaded. I cannot get the model to generate a very detailed description of a video. I am uploading a 30-minute soccer game, hoping the model can describe everything that happens for a visually impaired audience, but it appears to output only about 2k tokens before it stops responding. The video accounts for only about 0.5 million input tokens, which is not close to the maximum threshold.
The endpoint is Google AI Studio, accessed through the Google Gen AI SDK (`google-genai`).
def _upload_video(
    self,
    path: str,
    *,
    max_wait_seconds: float = 1000.0,
    poll_interval_seconds: float = 1.0,
) -> types.File:
    """
    Upload a video file to the Gemini API and wait until it is ACTIVE.

    If a file with the same hash is already listed and ACTIVE, that file is
    returned and nothing is uploaded.

    Args:
        path (str): Path to the video file.
        max_wait_seconds (float): Upper bound on how long to poll for the
            ACTIVE state after uploading.
        poll_interval_seconds (float): Delay between two state polls.

    Returns:
        types.File: The already-uploaded or freshly uploaded file, in the
            ACTIVE state.

    Raises:
        Exception: If the file reports the FAILED state, or does not become
            ACTIVE within ``max_wait_seconds``.
    """
    # Reuse an earlier upload of the same content instead of uploading again.
    # NOTE(review): assumes file_b64_sha256 produces the hash in the same
    # encoding the API reports in `sha256_hash` -- confirm.
    sha = file_b64_sha256(path)
    for existing in self.client.files.list():
        if existing.sha256_hash == sha and existing.state == "ACTIVE":
            return existing

    uploaded = self.client.files.upload(file=path)

    # Poll until the file becomes ACTIVE. Keep the refreshed object so the
    # caller receives a File whose `state` matches what was observed, rather
    # than the stale handle returned by the upload call.
    current = self.client.files.get(name=uploaded.name)
    deadline = time.monotonic() + max_wait_seconds
    while current.state != "ACTIVE":
        # FAILED is terminal: stop immediately instead of polling until the
        # deadline. Running out of time is reported the same way.
        if current.state == "FAILED" or time.monotonic() >= deadline:
            raise Exception(f"File upload failed, state: {current.state}")
        # Sleep before re-polling so the state checked next is fresh.
        time.sleep(poll_interval_seconds)
        current = self.client.files.get(name=uploaded.name)
    return current
def _send_video_request(
    self,
    path: str,
    prompt: str,
    *,
    max_output_tokens: int = 10000,
) -> GenerateContentResponse:
    """
    Send a request to the Gemini API with a video file and prompt.

    Args:
        path (str): Path to the video file.
        prompt (str): The prompt to send to the API.
        max_output_tokens (int): Output token budget passed to the
            generation config. Defaults to the previously hard-coded 10000.

    Returns:
        GenerateContentResponse: The response from the API.

    Raises:
        Exception: If validation, upload, or the API request fails. The
            original error is attached as ``__cause__``.
    """
    try:
        self._validate_file_path(path)
        video_file = self._upload_video(path)
        return self.client.models.generate_content(
            model=self.model,
            contents=[video_file, prompt],
            config=types.GenerateContentConfig(
                # NOTE(review): if responses stop mid-sentence, inspect
                # `response.candidates[0].finish_reason` and
                # `response.usage_metadata`. The model's own output cap may
                # be lower than this value, and on thinking-capable models
                # thinking tokens may count against this budget -- confirm
                # against the Gemini API documentation.
                max_output_tokens=max_output_tokens,
                temperature=0.5,
                top_p=0.8,
            ),
        )
    except Exception as e:
        # Chain explicitly so the underlying traceback stays attached.
        raise Exception(f"Error in video request: {str(e)}") from e
Sample response:
"...**Time:** 03:23 - 03:38\n* **Scene:** Haaland (#9) is shown standing and reacting to the collision, gesturing with his arms. The referee walks towards him.\n* **Transcription:**\n * Commentator 2: \u6885\u52aa\u548c\u54c8\u862d\u5fb7\u3002\n * Commentator 1: \u6703\u4e0d\u6703\u770bVR\u3002\n * Commentator 2: \u54ce\u5440\uff0c\u54c8\u862d\u5fb7\uff0c\u78ba\u5be6\u4f60\u8981\u8aaa\u4ed6\u767c\u63ee\u4e0d\u597d, \u4ed6\u78ba\u5be6\u767c\u63ee\u4e0d\u597d\uff0c\u4f46\u662f\u5462\uff0c\u4ed6\u53ef\u80fd\u4e5f\u6709\u59d4\u5c48\uff0c\u6211\u600e\u9ebc\u767c\u63ee\u4e0d\u662f\u62b1\u6211\u5c31\u6454\u6211\u3002\n * Commentator 1: \u55ef\u3002\n * Commentator 2: \u9019\u8db3\u7403\u5834\u6210\u4e86\u7121\u9928\u4e86\u3002\n* **Music:** None (only crowd noise and commentary)\n\n**Segment 13: Replay of Mainoo/Haaland Collision and Guardiola Reaction**\n* **Time:** 03:38 - 03:"