Hello everyone,
I am working on a project to recreate cinematic poses from reference images using the Google Gen AI SDK (Python). My goal is to achieve a high-fidelity transfer that captures not just the skeletal pose, but also the atmosphere, emotional tension, and micro-expressions (e.g., the physics of a touch or a specific gaze).
My Current Workflow:
- Analysis: I use a text model (gemini-2.0-flash) to generate a detailed description of the reference image (pose, lighting, interaction) and of the source identities.
- Sequential Generation: I feed these descriptions into the image model and currently perform a sequential swap:
  - Step 1: Swap “Subject 1” into the reference scene.
  - Step 2: Take the output of Step 1 and swap “Subject 2” into it.
The Problem:
While the individual likenesses are okay, the holistic output suffers.
- Disconnect: Because I generate them sequentially, the lighting and atmosphere often shift between steps.
- Interaction Physics: The “connection” between subjects (e.g., a hand holding a shoulder, or eye contact) often feels floating or disconnected. The model seems to struggle with the “physics of intimacy” (compression of skin, shared breath/atmosphere) when subjects are processed separately.
- Instruction Adherence: The model sometimes ignores specific micro-expression instructions (like “tension in the neck” or “heavy eyelids”) in favor of a generic neutral expression.
My Question:
Is the sequential (Step 1 → Step 2) approach the standard best practice for this, or is there a way to prompt for a simultaneous swap to ensure lighting consistency? How can I better prompt the model to strictly adhere to “micro-expression” details without losing the structural pose lock?
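For context, this is roughly the single-call shape I have in mind for a “simultaneous” swap. It is an untested sketch: I am assuming the image model will accept a reference plus two source images in one contents list, and the variable names (client, original_ref, subject1_source, subject2_source, etc.) refer to the same objects set up in the snippet further down. The “REFERENCE:” / “SOURCE A:” / “SOURCE B:” labels are just my own convention.
```
# Hypothetical single-pass variant (untested sketch, see assumptions above).
combined_prompt = """
TASK: Recreate the reference scene with BOTH identities swapped in a single pass.
- Replace Subject 1 with SOURCE A and Subject 2 with SOURCE B.
- Keep the reference lighting, color grading, and pose geometry unchanged.
- Preserve every contact point between the two subjects (compression, no hovering).
"""
resp = client.models.generate_content(
    model=IMAGE_MODEL,
    contents=[
        "REFERENCE:", original_ref,
        "SOURCE A:", subject1_source,
        "SOURCE B:", subject2_source,
        combined_prompt,
    ],
    config=GenerateContentConfig(
        response_modalities=[Modality.IMAGE],
        safety_settings=safe_config,
        temperature=0.25,
    ),
)
combined_result = extract_image(resp)  # same helper as in the pipeline below
```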
Below is a snippet of my current pipeline. Any tips on prompt engineering or architectural changes to improve the “holistic” feel would be appreciated.
```
import os
from io import BytesIO
from PIL import Image as PIL_Image
from google import genai
from google.genai.types import (
    GenerateContentConfig,
    Modality,
    HarmCategory,
    HarmBlockThreshold,
)
# -----------------------------
# CONFIGURATION
# -----------------------------
API_KEY = "API_KEY"
TEXT_MODEL = "gemini-2.0-flash"
IMAGE_MODEL = "nano-banana-pro"  # Using latest available image model
# -----------------------------
# PATHS
# -----------------------------
subject1_img_path = "./input/subject1.jpg"
subject2_img_path = "./input/subject2.jpg"
ref_img_path = "./input/reference_pose.png"
output_base = "./output"
os.makedirs(output_base, exist_ok=True)
client = genai.Client(api_key=API_KEY)
# Standard Safety Settings
safe_config = [
    {"category": HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, "threshold": HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE},
    {"category": HarmCategory.HARM_CATEGORY_HATE_SPEECH, "threshold": HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE},
    {"category": HarmCategory.HARM_CATEGORY_HARASSMENT, "threshold": HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE},
    {"category": HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, "threshold": HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE},
]
def extract_image(response):
    """Return the first inline image in the response, 'SAFETY_BLOCK' if blocked, or None."""
    if not response or not response.candidates:
        return None
    for candidate in response.candidates:
        if candidate.finish_reason and str(candidate.finish_reason) == "FinishReason.IMAGE_SAFETY":
            return "SAFETY_BLOCK"
        if not candidate.content or not candidate.content.parts:
            continue
        for part in candidate.content.parts:
            if part.inline_data:
                try:
                    return PIL_Image.open(BytesIO(part.inline_data.data))
                except Exception:
                    pass
    return None
def run_identity_swap(ref_image, source_image, output_folder, target_subject_name, source_subject_name):
    print(f"\n🔵 STARTING SWAP: {source_subject_name} -> {target_subject_name}")

    # 1. ANALYZE REFERENCE: POSE + INTERACTION PHYSICS
    # I am trying to force the model to understand the 'weight' of the touch
    ref_prompt = f"""
    Analyze this image for a cinematic scene reconstruction.
    Focus on **{target_subject_name}**.
    OUTPUT THREE SECTIONS:
    1. SKELETAL GEOMETRY: Exact head tilt, neck rotation, and hand placement.
    2. INTERACTION & PHYSICS:
    - Analyze contact points (skin-to-skin).
    - Describe pressure (light touch vs. gripping).
    - Analyze proximity (distance between faces).
    3. LIGHTING MOOD: Shadows and color grading.
    """
    ref_resp = client.models.generate_content(
        model=TEXT_MODEL,
        contents=[ref_image, ref_prompt],
        config=GenerateContentConfig(temperature=0),
    )
    ref_analysis = ref_resp.text

    # 2. ANALYZE SOURCE
    source_prompt = "Identify key visual traits: Face shape, skin texture details, lip shape, hair density."
    source_resp = client.models.generate_content(
        model=TEXT_MODEL,
        contents=[source_image, source_prompt],
        config=GenerateContentConfig(temperature=0.1),
    )
    source_desc = source_resp.text
    # 3. CONSTRUCT PROMPT
    DYNAMIC_TERM = "deep emotional bond, cinematic tension, tender moment"
    base_prompt = """
    TASK: Cinematic Identity Swap with High-Fidelity Physical Interaction.
    TARGET: {target_name}
    SOURCE IDENTITY: "{src_desc}"
    SCENE BLUEPRINT: "{ref_analysis}"
    INSTRUCTIONS:
    1. **IDENTITY:** Apply Source Identity. Preserve skin texture pores.
    2. **POSE:** Match the Blueprint exactly.
    3. **PHYSICS OF TOUCH (CRITICAL):**
    - **Compression:** Render the *weight* of the hand. Show fingers pressing slightly into skin/clothing.
    - **Tension:** Show tension in tendons if holding.
    - **No Hovering:** Eliminate gaps between contact points.
    4. **PROXIMITY & MICRO-EXPRESSIONS:**
    - If faces are close: Render "shared breath" atmosphere.
    - **Micro-Expressions:** Heavy eyelids, dilated pupils, slightly parted lips (if appropriate).
    - **Skin Flush:** Add subtle subsurface scattering (redness) to ears/cheeks.
    5. **ATMOSPHERE:**
    - Focus on the **{dynamic_term}**.
    """
    final_prompt = base_prompt.format(
        target_name=target_subject_name,
        src_desc=source_desc,
        ref_analysis=ref_analysis,
        dynamic_term=DYNAMIC_TERM,
    )
    # 4. GENERATE IMAGE (reference + source + combined prompt in one call)
    try:
        resp = client.models.generate_content(
            model=IMAGE_MODEL,
            contents=["REF:", ref_image, "SOURCE:", source_image, final_prompt],
            config=GenerateContentConfig(
                response_modalities=[Modality.IMAGE],
                safety_settings=safe_config,
                temperature=0.25,
            ),
        )
        result = extract_image(resp)
        if isinstance(result, PIL_Image.Image):
            return result
    except Exception as e:
        print(f"Error: {e}")
    return None
# -----------------------------
# MAIN EXECUTION
# -----------------------------
if __name__ == "__main__":
    try:
        subject1_source = PIL_Image.open(subject1_img_path)
        subject2_source = PIL_Image.open(subject2_img_path)
        original_ref = PIL_Image.open(ref_img_path)

        # Step 1: Swap Subject 1
        result_phase1 = run_identity_swap(
            ref_image=original_ref,
            source_image=subject1_source,
            output_folder=output_base,
            target_subject_name="Subject 1",
            source_subject_name="Subject_1_Source",
        )

        # Step 2: Swap Subject 2 (using result of Step 1)
        if result_phase1:
            result_phase2 = run_identity_swap(
                ref_image=result_phase1,
                source_image=subject2_source,
                output_folder=output_base,
                target_subject_name="Subject 2",
                source_subject_name="Subject_2_Source",
            )
            if result_phase2:
                # Save the final composite (filename is arbitrary)
                result_phase2.save(os.path.join(output_base, "final_swap.png"))
    except Exception as e:
        print(f"Setup Error: {e}")
```