How to get structured output for image input

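You can try the code below. It sends a local image together with a text prompt to a Gemini model and sets `responseMimeType` and `responseSchema` so that the reply comes back as structured JSON: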

import {
    GoogleGenerativeAI,
    SchemaType,
  } from "@google/generative-ai";
import fs from "fs";
import dotenv from 'dotenv';
dotenv.config();


// Shared SDK client; the API key is read from the GEMINI_API_KEY environment variable.
const { GEMINI_API_KEY: geminiApiKey } = process.env;
const genAI = new GoogleGenerativeAI(geminiApiKey);

/**
 * Reads a local file and wraps its contents as an inline-data part.
 *
 * @param {string} path - Location of the file to read (read synchronously).
 * @param {string} mimeType - MIME type to attach to the data, e.g. "image/png".
 * @returns {{ inlineData: { data: string, mimeType: string } }} Object holding
 *   the base64-encoded file contents together with the given MIME type.
 * @throws {Error} Whatever `fs.readFileSync` throws when the file cannot be read.
 */
function fileToGenerativePart(path, mimeType) {
  // readFileSync can encode directly; wrapping its Buffer in Buffer.from()
  // would only create a second copy of the file in memory.
  const data = fs.readFileSync(path, { encoding: "base64" });

  return {
    inlineData: {
      data,
      mimeType,
    },
  };
}
  
  /**
   * Sends a text prompt plus one local PNG to the model and logs the reply.
   *
   * The model is configured with responseMimeType "application/json" and a
   * response schema describing an array of objects that each carry a required,
   * non-nullable string property `city`.
   *
   * @returns {Promise<void>} Settles after the response text has been logged;
   *   rejects if reading the image or the generateContent call fails.
   */
  async function run() {
    // Shape the model is asked to follow: [{ city: "..." }, ...]
    const schema = {
      description: "List of cities",
      type: SchemaType.ARRAY,
      items: {
        type: SchemaType.OBJECT,
        properties: {
          city: {
            type: SchemaType.STRING,
            description: "Name of the city",
            nullable: false,
          },
        },
        required: ["city"],
      },
    };

    const model = genAI.getGenerativeModel({
      model: "gemini-1.5-pro",
      generationConfig: {
        responseMimeType: "application/json",
        responseSchema: schema,
      },
    });

    const prompt = "List all the cities from the given image";

    const imageParts = [
      fileToGenerativePart("/sample.png", "image/png"),
    ];

    // Spread the image parts so the request is one flat list of parts;
    // passing the array itself would nest it as a single array-valued entry.
    const generatedContent = await model.generateContent([prompt, ...imageParts]);

    console.log(generatedContent.response.text());
  }
  
  // Handle rejection explicitly rather than leaving the promise floating:
  // log the error and mark the process as failed.
  run().catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });
1 Like