I am trying to determine how to properly calculate pricing for prompts containing an image with Gemini 2.0 Flash. I am using the count_tokens API before sending a request to the generate_content endpoint, and I see very different numbers in that response from what I get in the usage metadata of the actual generate_content response. For reference, in the example below, I have a system prompt of around 100-150 tokens and put an image in the user prompt which is 1980x1380 px. I estimate output tokens at a 1:4 ratio based on my usage data.
I pass the same constructed prompt (system_prompt, user_prompt, Base64 ImageData) to the count_tokens endpoint and then to the generate_content endpoint using the same model. This is the output I get:
ESTIMATED - Input tokens: 415, Output tokens: 103, Cost: 0.0001136
2025-03-03 10:22:39,789 [INFO] app.routers.records: Record xxx updated with new original_text.
2025-03-03 10:22:39,790 [INFO] app.routers.ai: Transcribe operation completed. Input tokens: 2085, Output tokens: 405
Cost: 0.000492
For reference, here is the relevant code:
async def count_input_tokens(self, engine: str, system_prompt: str, user_prompt: str, image_data: Optional[Any]) -> int:
    """
    Return the total input-token count for a prompt (text plus optional images).

    Builds the exact same content list used for generation via
    _prepare_content, then asks the model's count_tokens endpoint for the
    total, running the blocking SDK call off the event loop.

    Args:
        engine (str): Model engine to use
        system_prompt (str): System prompt to guide the AI
        user_prompt (str): User prompt/question
        image_data (Optional[Any]): Image data for multimodal requests

    Returns:
        int: Total number of input tokens, or 0 if counting fails
    """
    try:
        # Same content-assembly path as generation, so counts stay comparable
        _, prepared = await self._prepare_content(system_prompt, user_prompt, image_data)
        counting_model = genai.GenerativeModel(model_name=engine)
        # count_tokens is a blocking network call; keep the loop responsive
        token_result = await asyncio.to_thread(counting_model.count_tokens, prepared)
        total_tokens = getattr(token_result, 'total_tokens', 0)
        logger.debug(f"Total token count for content: {total_tokens}")
        return total_tokens
    except Exception as e:
        # Counting is best-effort; a failed estimate must not block the request
        logger.error(f"Error in count_input_tokens: {e}")
        return 0
async def _prepare_content(
self,
system_prompt: str,
user_prompt: str,
image_data: Optional[Any]
) -> Tuple[str, List[Any]]:
"""
Prepare content for Gemini API.
Args:
system_prompt (str): System prompt to guide the AI
user_prompt (str): User prompt/question
image_data (Optional[Any]): Image data for multimodal requests
Returns:
Tuple[str, List[Any]]: Tuple containing system prompt and content list
"""
content = []
if system_prompt:
content.append(system_prompt)
# Add user prompt if provided
if user_prompt:
content.append(user_prompt)
if image_data:
# Normalize image data to base64 format
normalized_data = normalize_image_input(image_data)
if normalized_data:
# Handle single image case (string)
if isinstance(normalized_data, str):
image = base64_to_image(normalized_data)
if image:
content.append(image)
# Handle multiple images case (list of tuples)
elif isinstance(normalized_data, list):
for base64_data, label in normalized_data:
if label:
content.append(label)
image = base64_to_image(base64_data)
if image:
content.append(image)
return system_prompt, content
async def _generate_response(
    self,
    engine: str,
    prepared_content: Tuple[str, List[Any]],
    temp: float
) -> Any:
    """
    Generate response from Gemini.

    Args:
        engine (str): Model engine to use (e.g., 'gemini-pro', 'gemini-pro-vision')
        prepared_content (Tuple[str, List[Any]]): Tuple of system prompt and content list
        temp (float): Temperature parameter

    Returns:
        Any: Raw API response object
    """
    system_prompt, content = prepared_content
    # BUG FIX: _prepare_content places the system prompt at the head of the
    # content list, and this method ALSO passes it as system_instruction, so
    # the system prompt was sent twice per request. That inflates the actual
    # input-token usage reported by generate_content relative to the
    # count_tokens estimate (which counts the content list only once).
    # Strip the duplicate leading entry before sending.
    if system_prompt and content and content[0] == system_prompt:
        content = content[1:]
    # Configure generation parameters
    generation_config = {
        "temperature": temp,
        "top_p": 0.95,
        "top_k": 40,
        "max_output_tokens": 8192,
    }
    # Initialize the model; the system prompt travels as a proper
    # system_instruction, not as part of the user content
    model = genai.GenerativeModel(
        model_name=engine,
        generation_config=generation_config,
        system_instruction=system_prompt
    )
    # Set safety settings - using non-blocking settings to match the existing approach
    safety_settings = [
        {
            "category": HarmCategory.HARM_CATEGORY_HARASSMENT,
            "threshold": "BLOCK_NONE",
        },
        {
            "category": HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            "threshold": "BLOCK_NONE",
        },
        {
            "category": HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
            "threshold": "BLOCK_NONE",
        },
        {
            "category": HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            "threshold": "BLOCK_NONE",
        }
    ]
    # Create and track the task for potential cancellation
    task = asyncio.create_task(
        asyncio.to_thread(
            model.generate_content,
            contents=content,
            safety_settings=safety_settings
        )
    )
    self._tasks.add(task)
    try:
        # Wait for the task to complete
        response = await task
        logger.debug(f"Gemini API Raw Response: {response}")
        # Store the response for token counting (usage metadata lives on it)
        self._last_response = response
        return response
    finally:
        # BUG FIX: discard() instead of remove() — remove() raises KeyError
        # if the task was already cleared elsewhere (e.g. during shutdown
        # or cancellation cleanup)
        self._tasks.discard(task)