I had written the following code:
“”"
=== Excel Text Extraction Test ===
Tests Google Gemini 2.5 Pro’s ability to extract text from multi-sheet Excel files
“”"
import os
from dotenv import load_dotenv
from google import genai
from google.genai import types
load_dotenv()
model = os.getenv(‘MODEL_NAME’)
temperature = os.getenv(‘TEMPERATURE’)
# Initialize the client with API key
client = genai.Client()
def upload_excel_file(file_path: str) → str:
“”"
Uploads an Excel file to Gemini API using the Files API
Args:
file_path: Path to the Excel file
Returns:
file: Uploaded file
“”"
try:
# Upload the file
uploaded_file = client.files.upload(
file=file_path,
config=dict(
# mime_type=“application/vnd.openxmlformats-officedocument.spreadsheetml.sheet”
mime_type=“text/plain”
)
)
print(f" File uploaded successfully")
print(f" Name: {uploaded_file.name}")
print(f" MIME Type: {uploaded_file.mime_type}")
print(f" URI: {uploaded_file.uri}")
return uploaded_file
except Exception as e:
print(f"Error uploading file: {str(e)}")
raise
def extract_text_from_excel(file: str, system_instruction: str, prompt: str) → dict:
“”"
Extracts text from Excel file using Gemini 2.5 Pro
Args:
file: Uploaded file
system_instruction: System instruction for the prompt
prompt: Custom prompt for extraction
Returns:
dict: Extracted text and metadata
“”"
try:
# Create the request with the file
response = client.models.generate_content(
model=model,
contents=[
file,
prompt
],
config=types.GenerateContentConfig(
system_instruction=system_instruction,
temperature=temperature,
# max_output_tokens=8192,
safety_settings= [
types.SafetySetting(
category=‘HARM_CATEGORY_HATE_SPEECH’,
threshold=‘BLOCK_ONLY_HIGH’
),
]
)
)
extracted_text = response.text
return {
‘success’: True,
‘extracted_text’: extracted_text,
‘model_used’: model,
‘tokens_used’: {
‘prompt’: response.usage_metadata.prompt_token_count if hasattr(response, ‘usage_metadata’) else None,
‘completion’: response.usage_metadata.candidates_token_count if hasattr(response, ‘usage_metadata’) else None,
‘total’: response.usage_metadata.total_token_count if hasattr(response, ‘usage_metadata’) else None,
}
}
except Exception as e:
return {
‘success’: False,
‘error’: str(e),
‘extracted_text’: None
}
def test_excel_extraction(file_path: str, system_instruction: str, prompt: str):
“”"
Complete workflow to test Excel text extraction
Args:
file: Path to the Excel file
system_instruction: System instruction
prompt: Custom prompt
“”"
print(“\n” + “=”*80)
print(“EXCEL TEXT EXTRACTION TEST - GOOGLE GEMINI 2.5 PRO”)
print(“=”*80 + “\n”)
# Step 1: Upload file
print(“[Step 1] Uploading Excel file…”)
file = upload_excel_file(file_path)
# Step 2: Extract text
print(“\n[Step 2] Extracting text from Excel file…”)
result = extract_text_from_excel(file, system_instruction, prompt)
# Step 3: Display results
print(“\n[Step 3] Extraction Results:”)
print(“-” * 80)
if result[‘success’]:
print(f" Extraction successful!")
print(f"\nModel Used: {result[‘model_used’]}")
if result[‘tokens_used’][‘total’]:
print(f"\nToken Usage:")
print(f" - Prompt tokens: {result[‘tokens_used’][‘prompt’]}")
print(f" - Completion tokens: {result[‘tokens_used’][‘completion’]}")
print(f" - Total tokens: {result[‘tokens_used’][‘total’]}")
print(f"\n{‘=’*80}")
print(“EXTRACTED TEXT:”)
print(‘=’*80)
print(result[‘extracted_text’])
print(‘=’*80)
# Save to file
output_file = f"Output Files/{file_path.split(‘/’)[-1].split(‘.’)[0]}_extracted.txt"
with open(output_file, ‘w’, encoding=‘utf-8’) as f:
f.write(result[‘extracted_text’])
print(f"\nExtracted text saved to: {output_file}")
else:
print(f"Extraction failed!")
print(f"Error: {result[‘error’]}")
print(“\n” + “=”*80 + “\n”)
return result
# Example usage
if _name_ == “_main_”:
excel_file = “.\Input Files\Renewal 2025.xlsx”
system_instruction = “”"
|| ROLE ||
You are an expert in extracting information from insurance documents in Excel format with single or multiple sheets.
|| GOAL ||
Your job is to extract all information from the Excel file(s) provided. Preserve ALL original content. Do not make additions, modifications, or removals.
“”"
prompt = “”"
This is an insurance document in Excel format with the possibility of the presence of multiple sheets.
Please extract ALL information from ALL sheets including:
- Sheet names, if any
- All headers and column names
- All data rows
- Any special formatting or notes
- Table structures and relationships
Organize the output clearly by sheet name, if any, and maintain the table structure.
“”"
# Run the test
result = test_excel_extraction(excel_file, system_instruction, prompt)
# Additional analysis
if result[‘success’]:
print(“\n[Analysis]”)
text_length = len(result[‘extracted_text’])
print(f"Total characters extracted: {text_length:,}")
print(f"Total words extracted: {len(result[‘extracted_text’].split()):,}")
I am getting this error:
================================================================================
EXCEL TEXT EXTRACTION TEST - GOOGLE GEMINI 2.5 PRO
[Step 1] Uploading Excel file…
File uploaded successfully
Name: files/wmc0f4byonl3
MIME Type: text/plain
URI: https://generativelanguage.googleapis.com/v1beta/files/wmc0f4byonl3
[Step 2] Extracting text from Excel file…
[Step 3] Extraction Results:
Extraction failed!
Error: 500 INTERNAL. {‘error’: {‘code’: 500, ‘message’: ‘An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting’, ‘status’: ‘INTERNAL’}}
================================================================================
When I am changing the MIME type to:
mime_type=“application/vnd.openxmlformats-officedocument.spreadsheetml.sheet”
I am getting this error instead:
================================================================================
EXCEL TEXT EXTRACTION TEST - GOOGLE GEMINI 2.5 PRO
[Step 1] Uploading Excel file…
Error uploading file: ‘file’
Traceback (most recent call last):
File “C:\Users\USER\Documents\PiTangent Gitlab Repos\Policysmart\FirstTry\excel_read_claude.py”, line 191, in
result = test_excel_extraction(excel_file, system_instruction, prompt)
File “C:\Users\USER\Documents\PiTangent Gitlab Repos\Policysmart\FirstTry\excel_read_claude.py”, line 123, in test_excel_extraction
file = upload_excel_file(file_path)
File “C:\Users\USER\Documents\PiTangent Gitlab Repos\Policysmart\FirstTry\excel_read_claude.py”, line 33, in upload_excel_file
uploaded_file = client.files.upload(
File “C:\Users\USER\Documents\PiTangent Gitlab Repos\Policysmart\FirstTry\venv\lib\site-packages\google\genai\files.py”, line 494, in upload
response=return_file.json[‘file’],
KeyError: 'file