Having trouble prompt engineering using Image, Prompt, CoCo

For some reason when I try to pass a coco annotation file to this endpoint it fails. Can someone please help me with this issue? Here’s my test script:

import base64
import json
import os
import requests

def generate_content(api_key, image_path, coco_file_path, prompt):
    """
    Sends an image (Base64-encoded), a COCO file (as serialized JSON text), and a text prompt to the Google Gemini API.

    Args:
        api_key (str): Your Google API key.
        image_path (str): Path to the image file.
        coco_file_path (str): Path to the COCO annotation JSON file.
        prompt (str): The text prompt.

    Returns:
        dict: The API response.
    """
    try:
        # Read and encode the image as Base64
        with open(image_path, "rb") as image_file:
            encoded_image = base64.b64encode(image_file.read()).decode("utf-8")

        # Load the COCO JSON file and serialize as a plain text string
        with open(coco_file_path, "r") as coco_file:
            coco_data = json.load(coco_file)
        serialized_coco_data = json.dumps(coco_data)  # Serialize as JSON string

        # Create the JSON payload
        payload = {
            "contents": [{
                "parts": [
                    {"text": prompt},
                    {
                        "inline_data": {
                            "mime_type": "image/jpeg",
                            "data": encoded_image
                        }
                    },
                    {
                        "inline_data": {
                            "mime_type": "text/plain",  # Use text/plain for serialized COCO JSON
                            "data": serialized_coco_data  # Include serialized COCO data as text
                        }
                    }
                ]
            }]
        }

        # API URL
        api_url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent"

        # Send the request
        headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
        response = requests.post(api_url, headers=headers, json=payload)

        # Handle the response
        if response.status_code == 200:
            return response.json()
        else:
            print(f"Error: {response.status_code} - {response.text}")
            return None

    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Example Usage
api_key = os.getenv("GOOGLE_API_KEY")  # Use environment variable for security
coco_path = "/general/STest/eeaaf981-12d7-4de5-9f05-f3480f4b5301.coco_annotations.json"
image_path = "/general/STest/eeaaf981-12d7-4de5-9f05-f3480f4b5301.rgb_0000.png"
prompt = "Please describe this image using annotations."

result = generate_content(api_key, image_path, coco_path, prompt)
if result:
    print("Response:", json.dumps(result, indent=2))

Welcome to the forum.

I suspect your serialized_coco_data gets extra escape characters when you json.dumps(coco_data), making the overall payload invalid JSON. You can check by first outputting the payload locally and then testing whether a JSON deserializer can handle it.

Hope that helps.

1 Like