Best practices for Multi-Subject Pose Transfer and Interaction Physics using Gemini API?

Hello everyone,

I am working on a project to recreate cinematic poses from reference images using the Google Gen AI SDK (Python). My goal is to achieve a high-fidelity transfer that captures not just the skeletal pose, but also the atmosphere, emotional tension, and micro-expressions (e.g., the physics of a touch or a specific gaze).

My Current Workflow:

  1. Analysis: I use a text model (gemini-2.0-flash) to generate a detailed description of the reference image (pose, lighting, interaction) and the source identities.

  2. Sequential Generation: I feed these descriptions into the image model. I currently perform a sequential swap:

    • Step 1: Swap “Subject 1” into the reference scene.

    • Step 2: Take the output of Step 1 and swap “Subject 2” into it.

The Problem:
While the individual likenesses are okay, the holistic output suffers.

  • Disconnect: Because I generate them sequentially, the lighting and atmosphere often shift between steps.

  • Interaction Physics: The “connection” between subjects (e.g., a hand holding a shoulder, or eye contact) often feels floating or disconnected. The model seems to struggle with the “physics of intimacy” (compression of skin, shared breath/atmosphere) when subjects are processed separately.

  • Instruction Adherence: The model sometimes ignores specific micro-expression instructions (like “tension in the neck” or “heavy eyelids”) in favor of a generic neutral expression.

My Question:
Is the sequential (Step 1 → Step 2) approach the standard best practice for this, or is there a way to prompt for a simultaneous swap to ensure lighting consistency? How can I better prompt the model to strictly adhere to “micro-expression” details without losing the structural pose lock?
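
For context, this is roughly what I imagine a single-pass (simultaneous) version would look like — untested, reusing the client, model names, safety config, and `extract_image` helper from my pipeline below, with `combined_prompt` as a placeholder for one prompt covering both identities plus the interaction physics. I don't know whether interleaving two labeled source images with the reference like this is actually supported or recommended:

```
# Hypothetical single-pass swap (untested): both identities and the reference
# go into ONE image-generation call so lighting/atmosphere are resolved together.
combined_prompt = """
TASK: Swap BOTH subjects in a single pass.
- Replace Subject 1 with SOURCE IDENTITY A, Subject 2 with SOURCE IDENTITY B.
- Preserve the reference pose, contact points, pressure, and lighting for both.
"""  # placeholder; would carry the full physics / micro-expression instructions

resp = client.models.generate_content(
    model=IMAGE_MODEL,
    contents=[
        "REFERENCE SCENE (pose, lighting, interaction):", original_ref,
        "SOURCE IDENTITY A (for Subject 1):", subject1_source,
        "SOURCE IDENTITY B (for Subject 2):", subject2_source,
        combined_prompt,
    ],
    config=GenerateContentConfig(
        response_modalities=[Modality.IMAGE],
        safety_settings=safe_config,
        temperature=0.25,
    ),
)
result = extract_image(resp)  # same helper as in the full snippet below
```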

Below is a snippet of my current sequential pipeline. Any tips on prompt engineering or architectural changes to improve the “holistic” feel would be appreciated.
```

import os
from io import BytesIO

from PIL import Image as PIL_Image
from google import genai
from google.genai.types import (
    GenerateContentConfig,
    Modality,
    HarmCategory,
    HarmBlockThreshold,
)

# -----------------------------
# CONFIGURATION
# -----------------------------
API_KEY = "API_KEY"
TEXT_MODEL = "gemini-2.0-flash"
IMAGE_MODEL = "nano-banana-pro"  # Using latest available image model

# -----------------------------
# PATHS
# -----------------------------
subject1_img_path = "./input/subject1.jpg"
subject2_img_path = "./input/subject2.jpg"
ref_img_path = "./input/reference_pose.png"
output_base = "./output"
os.makedirs(output_base, exist_ok=True)

client = genai.Client(api_key=API_KEY)

# Standard safety settings
safe_config = [
    {"category": HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, "threshold": HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE},
    {"category": HarmCategory.HARM_CATEGORY_HATE_SPEECH, "threshold": HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE},
    {"category": HarmCategory.HARM_CATEGORY_HARASSMENT, "threshold": HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE},
    {"category": HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, "threshold": HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE},
]

def extract_image(response):
    """Return the first image part as a PIL image, 'SAFETY_BLOCK' if blocked, else None."""
    if not response or not response.candidates:
        return None
    for candidate in response.candidates:
        if candidate.finish_reason and str(candidate.finish_reason) == "FinishReason.IMAGE_SAFETY":
            return "SAFETY_BLOCK"
        if not candidate.content or not candidate.content.parts:
            continue
        for part in candidate.content.parts:
            if part.inline_data:
                try:
                    return PIL_Image.open(BytesIO(part.inline_data.data))
                except Exception:
                    pass
    return None

def run_identity_swap(ref_image, source_image, output_folder, target_subject_name, source_subject_name):
    print(f"\n🔵 STARTING SWAP: {source_subject_name} -> {target_subject_name}")

    # 1. ANALYZE REFERENCE: POSE + INTERACTION PHYSICS
    # I am trying to force the model to understand the 'weight' of the touch
    ref_prompt = f"""
    Analyze this image for a cinematic scene reconstruction.
    Focus on **{target_subject_name}**.

    OUTPUT THREE SECTIONS:
    1. SKELETAL GEOMETRY: Exact head tilt, neck rotation, and hand placement.
    2. INTERACTION & PHYSICS:
       - Analyze contact points (skin-to-skin).
       - Describe pressure (light touch vs. gripping).
       - Analyze proximity (distance between faces).
    3. LIGHTING MOOD: Shadows and color grading.
    """
    ref_resp = client.models.generate_content(
        model=TEXT_MODEL,
        contents=[ref_image, ref_prompt],
        config=GenerateContentConfig(temperature=0),
    )
    ref_analysis = ref_resp.text

    # 2. ANALYZE SOURCE IDENTITY
    source_prompt = "Identify key visual traits: Face shape, skin texture details, lip shape, hair density."
    source_resp = client.models.generate_content(
        model=TEXT_MODEL,
        contents=[source_image, source_prompt],
        config=GenerateContentConfig(temperature=0.1),
    )
    source_desc = source_resp.text

    # 3. CONSTRUCT THE GENERATION PROMPT
    DYNAMIC_TERM = "deep emotional bond, cinematic tension, tender moment"
    base_prompt = """
    TASK: Cinematic Identity Swap with High-Fidelity Physical Interaction.

    TARGET: {target_name}
    SOURCE IDENTITY: "{src_desc}"
    SCENE BLUEPRINT: "{ref_analysis}"

    INSTRUCTIONS:
    1. **IDENTITY:** Apply Source Identity. Preserve skin texture pores.
    2. **POSE:** Match the Blueprint exactly.
    3. **PHYSICS OF TOUCH (CRITICAL):**
       - **Compression:** Render the *weight* of the hand. Show fingers pressing slightly into skin/clothing.
       - **Tension:** Show tension in tendons if holding.
       - **No Hovering:** Eliminate gaps between contact points.
    4. **PROXIMITY & MICRO-EXPRESSIONS:**
       - If faces are close: Render "shared breath" atmosphere.
       - **Micro-Expressions:** Heavy eyelids, dilated pupils, slightly parted lips (if appropriate).
       - **Skin Flush:** Add subtle subsurface scattering (redness) to ears/cheeks.
    5. **ATMOSPHERE:**
       - Focus on the **{dynamic_term}**.
    """
    final_prompt = base_prompt.format(
        target_name=target_subject_name,
        src_desc=source_desc,
        ref_analysis=ref_analysis,
        dynamic_term=DYNAMIC_TERM,
    )

    # 4. GENERATE IMAGE (one subject per call — this is the sequential part in question)
    try:
        resp = client.models.generate_content(
            model=IMAGE_MODEL,
            contents=["REF:", ref_image, "SOURCE:", source_image, final_prompt],
            config=GenerateContentConfig(
                response_modalities=[Modality.IMAGE],
                safety_settings=safe_config,
                temperature=0.25,
            ),
        )
        result = extract_image(resp)
        if isinstance(result, PIL_Image.Image):
            return result
    except Exception as e:
        print(f"Error: {e}")
    return None

# MAIN EXECUTION
if __name__ == "__main__":
    try:
        subject1_source = PIL_Image.open(subject1_img_path)
        subject2_source = PIL_Image.open(subject2_img_path)
        original_ref = PIL_Image.open(ref_img_path)

        # Step 1: Swap Subject 1 into the reference scene
        result_phase1 = run_identity_swap(
            ref_image=original_ref,
            source_image=subject1_source,
            output_folder=output_base,
            target_subject_name="Subject 1",
            source_subject_name="Subject_1_Source",
        )

        # Step 2: Swap Subject 2 into the output of Step 1
        if result_phase1:
            result_phase1.save(os.path.join(output_base, "phase1_subject1.png"))
            result_phase2 = run_identity_swap(
                ref_image=result_phase1,
                source_image=subject2_source,
                output_folder=output_base,
                target_subject_name="Subject 2",
                source_subject_name="Subject_2_Source",
            )
            if result_phase2:
                result_phase2.save(os.path.join(output_base, "phase2_final.png"))
    except Exception as e:
        print(f"Setup Error: {e}")

```