# SAM3D / app.py
# bhatanerohan's picture
# Update app.py
# adc638a verified
"""
Text-to-3D Pipeline with Editing: Gemini + SAM-3D
MCP Server + Gradio UI for MCP Hackathon
"""
import os
import io
import json
import tempfile
import gradio as gr
from google import genai
from google.genai import types
from PIL import Image
import modal
# Shared Gemini client; stays None until init_gemini() succeeds.
client = None


def init_gemini() -> bool:
    """Create the module-level Gemini client from ``GEMINI_API_KEY``.

    Returns:
        True if the environment variable holds a non-empty key and ``client``
        was (re)created; False otherwise, in which case ``client`` is left
        untouched.
    """
    global client
    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        return False
    # Pass the key explicitly so the client uses exactly the key that was
    # checked above, rather than writing it back into os.environ (a no-op)
    # and leaving key discovery to the SDK.
    client = genai.Client(api_key=api_key)
    return True
def image_to_bytes(image):
    """Serialize *image* as PNG and return the encoded bytes.

    ``image`` must provide a PIL-style ``save(fp, format=...)`` method.
    """
    with io.BytesIO() as png_stream:
        image.save(png_stream, format='PNG')
        return png_stream.getvalue()
def run_sam3d(image, mask):
    """Run SAM-3D reconstruction on Modal for an image/mask pair.

    The image is converted to RGB, both inputs are PNG-encoded, and the bytes
    are passed to ``SAM3DModel.reconstruct`` of the deployed
    ``sam3d-objects-inference`` Modal app.

    Returns:
        The ``(ply_bytes, glb_bytes)`` pair produced by the remote call.
    """
    rgb_png = image_to_bytes(image.convert("RGB"))
    mask_png = image_to_bytes(mask)
    model_cls = modal.Cls.from_name("sam3d-objects-inference", "SAM3DModel")
    ply_bytes, glb_bytes = model_cls().reconstruct.remote(rgb_png, mask_png)
    return ply_bytes, glb_bytes
# ============================================================
# MCP TOOLS - These functions are exposed as MCP tools
# ============================================================
def generate_3d_model(prompt: str) -> str:
    """
    Generate a 3D model from a text description.

    Pipeline: Gemini renders the object, a second Gemini call removes the
    background, the result is converted to a grayscale mask, and SAM-3D
    reconstructs the model. All outputs are written to a fresh temporary
    directory.

    Args:
        prompt: Text description of the object to generate (e.g., "a red sports car", "a wooden chair")

    Returns:
        JSON string with paths to generated files, or ``{"error": "..."}``
        when the prompt is empty, the API key is missing, or any step fails.
    """
    # Reject blank prompts before spending any API calls on them.
    if not prompt or not prompt.strip():
        return json.dumps({"error": "Prompt must not be empty"})
    if not client:
        if not init_gemini():
            return json.dumps({"error": "GEMINI_API_KEY not configured"})
    try:
        # STEP 1: Generate image
        initial_prompt = f"{prompt}, three-quarter front view angle, natural daylight, soft shadows showing depth and contours, clean simple background, full object visible, photorealistic"
        response_gen = client.models.generate_content(
            model="gemini-2.5-flash-image",
            contents=[initial_prompt],
        )
        initial_image = None
        # ``parts`` may be None when the response carries no content; treat
        # that as "no image" instead of failing with a TypeError.
        for part in response_gen.parts or []:
            if part.inline_data:
                initial_image = Image.open(io.BytesIO(part.inline_data.data))
                break
        if initial_image is None:
            return json.dumps({"error": "Image generation failed"})

        # STEP 2: Remove background
        edit_prompt = "Remove the background completely, make the background transparent. Preserve the object's shadow for realism."
        image_part = types.Part.from_bytes(
            data=image_to_bytes(initial_image),
            mime_type="image/png"
        )
        response_edit = client.models.generate_content(
            model="gemini-3-pro-image-preview",
            contents=[edit_prompt, image_part],
        )
        final_image = None
        for part in response_edit.parts or []:
            if part.inline_data:
                final_image = Image.open(io.BytesIO(part.inline_data.data))
                break
        if final_image is None:
            return json.dumps({"error": "Background removal failed"})

        # STEP 3: Create grayscale mask
        gray = final_image.convert("L")

        # STEP 4: Run SAM-3D
        ply_bytes, glb_bytes = run_sam3d(final_image, gray)

        # Save all outputs
        temp_dir = tempfile.mkdtemp()
        original_path = os.path.join(temp_dir, "original.png")
        nobg_path = os.path.join(temp_dir, "transparent.png")
        mask_path = os.path.join(temp_dir, "mask.png")
        ply_path = os.path.join(temp_dir, "model.ply")
        initial_image.save(original_path)
        final_image.save(nobg_path)
        gray.save(mask_path)
        with open(ply_path, 'wb') as f:
            f.write(ply_bytes)
        # The GLB is optional: only written when SAM-3D returned one.
        glb_path = None
        if glb_bytes:
            glb_path = os.path.join(temp_dir, "model.glb")
            with open(glb_path, 'wb') as f:
                f.write(glb_bytes)
        return json.dumps({
            "success": True,
            "prompt": prompt,
            "original_image": original_path,
            "transparent_image": nobg_path,
            "mask_image": mask_path,
            "ply_model": ply_path,
            "glb_model": glb_path,
            "message": f"Successfully generated 3D model for: {prompt}"
        })
    except Exception as e:
        # Tool boundary: report any failure to the caller as JSON.
        return json.dumps({"error": str(e)})
def edit_3d_model(edit_prompt: str, transparent_image_path: str) -> str:
    """
    Edit an existing 3D model by modifying its transparent image and regenerating.

    The image at ``transparent_image_path`` is sent to Gemini together with the
    edit instruction; the edited image is converted to a grayscale mask and
    passed to SAM-3D. Outputs are written to a fresh temporary directory.

    Args:
        edit_prompt: Description of the edit to apply (e.g., "remove the wings", "change color to blue")
        transparent_image_path: Path to the transparent PNG image from a previous generation

    Returns:
        JSON string with paths to the new edited files, or ``{"error": "..."}``
        when an argument is empty, the API key is missing, or any step fails.
    """
    # Reject blank arguments before spending any API calls on them.
    if not edit_prompt or not edit_prompt.strip():
        return json.dumps({"error": "Edit prompt must not be empty"})
    if not transparent_image_path:
        return json.dumps({"error": "transparent_image_path must not be empty"})
    if not client:
        if not init_gemini():
            return json.dumps({"error": "GEMINI_API_KEY not configured"})
    try:
        # NOTE(review): the path comes from the caller and is opened as-is;
        # consider restricting it to directories this app created.
        # The context manager closes the underlying file once it is encoded.
        with Image.open(transparent_image_path) as current_image:
            current_png = image_to_bytes(current_image)
        image_part = types.Part.from_bytes(
            data=current_png,
            mime_type="image/png"
        )
        full_edit_prompt = f"{edit_prompt}. Keep the background transparent. Maintain image quality and lighting."
        response_edit = client.models.generate_content(
            model="gemini-3-pro-image-preview",
            contents=[full_edit_prompt, image_part],
        )
        edited_image = None
        # ``parts`` may be None when the response carries no content; treat
        # that as "no image" instead of failing with a TypeError.
        for part in response_edit.parts or []:
            if part.inline_data:
                edited_image = Image.open(io.BytesIO(part.inline_data.data))
                break
        if edited_image is None:
            return json.dumps({"error": "Edit failed"})

        gray = edited_image.convert("L")
        ply_bytes, glb_bytes = run_sam3d(edited_image, gray)

        temp_dir = tempfile.mkdtemp()
        nobg_path = os.path.join(temp_dir, "edited.png")
        mask_path = os.path.join(temp_dir, "mask.png")
        ply_path = os.path.join(temp_dir, "model.ply")
        edited_image.save(nobg_path)
        gray.save(mask_path)
        with open(ply_path, 'wb') as f:
            f.write(ply_bytes)
        # The GLB is optional: only written when SAM-3D returned one.
        glb_path = None
        if glb_bytes:
            glb_path = os.path.join(temp_dir, "model.glb")
            with open(glb_path, 'wb') as f:
                f.write(glb_bytes)
        return json.dumps({
            "success": True,
            "edit_prompt": edit_prompt,
            "transparent_image": nobg_path,
            "mask_image": mask_path,
            "ply_model": ply_path,
            "glb_model": glb_path,
            "message": f"Successfully applied edit: {edit_prompt}"
        })
    except Exception as e:
        # Tool boundary: report any failure to the caller as JSON.
        return json.dumps({"error": str(e)})
# ============================================================
# GRADIO UI FUNCTIONS
# ============================================================
def generate_3d_ui(prompt, progress=gr.Progress()):
    """UI wrapper with progress updates.

    Runs the text -> image -> background removal -> mask -> SAM-3D pipeline
    and reports each stage through ``progress``.

    Args:
        prompt: Text description of the object to generate.
        progress: Gradio progress tracker.

    Returns:
        An 8-tuple: original image path, background-removed image path, mask
        path, path for the 3D viewer (GLB when available, otherwise PLY), GLB
        path or None, PLY path, the background-removed image object, and the
        edit counter value ``1``.

    Raises:
        gr.Error: If the prompt is empty, the API key is missing, or any
            pipeline stage fails.
    """
    if not prompt or not prompt.strip():
        raise gr.Error("Please enter a prompt")
    if not client:
        if not init_gemini():
            raise gr.Error("GEMINI_API_KEY not set in Space secrets")

    progress(0.1, desc="Generating image...")
    initial_prompt = f"{prompt}, three-quarter front view angle, natural daylight, soft shadows showing depth and contours, clean simple background, full object visible, photorealistic"
    try:
        response_gen = client.models.generate_content(
            model="gemini-2.5-flash-image",
            contents=[initial_prompt],
        )
        initial_image = None
        # ``parts`` may be None when the response carries no content.
        for part in response_gen.parts or []:
            if part.inline_data:
                initial_image = Image.open(io.BytesIO(part.inline_data.data))
                break
    except Exception as e:
        raise gr.Error(f"Image generation failed: {e}") from e
    # Checked outside the try so this gr.Error is not caught and re-wrapped
    # into "Image generation failed: Image generation failed".
    if initial_image is None:
        raise gr.Error("Image generation failed")

    progress(0.3, desc="Removing background...")
    try:
        image_part = types.Part.from_bytes(
            data=image_to_bytes(initial_image),
            mime_type="image/png"
        )
        response_edit = client.models.generate_content(
            model="gemini-3-pro-image-preview",
            contents=["Remove the background completely, make the background transparent. Preserve the object's shadow for realism.", image_part],
        )
        final_image = None
        for part in response_edit.parts or []:
            if part.inline_data:
                final_image = Image.open(io.BytesIO(part.inline_data.data))
                break
    except Exception as e:
        raise gr.Error(f"Background removal failed: {e}") from e
    if final_image is None:
        raise gr.Error("Background removal failed")

    progress(0.4, desc="Creating mask...")
    gray = final_image.convert("L")

    progress(0.5, desc="Running SAM-3D (1-2 min, first run may take longer)...")
    try:
        ply_bytes, glb_bytes = run_sam3d(final_image, gray)
    except Exception as e:
        raise gr.Error(f"SAM-3D failed: {e}") from e

    progress(0.9, desc="Saving outputs...")
    temp_dir = tempfile.mkdtemp()
    original_path = os.path.join(temp_dir, "original.png")
    nobg_path = os.path.join(temp_dir, "no_background.png")
    mask_path = os.path.join(temp_dir, "mask.png")
    ply_path = os.path.join(temp_dir, "model.ply")
    initial_image.save(original_path)
    final_image.save(nobg_path)
    gray.save(mask_path)
    with open(ply_path, 'wb') as f:
        f.write(ply_bytes)
    # The GLB is optional: only written when SAM-3D returned one.
    glb_path = None
    if glb_bytes:
        glb_path = os.path.join(temp_dir, "model.glb")
        with open(glb_path, 'wb') as f:
            f.write(glb_bytes)

    progress(1.0, desc="Done!")
    return (
        original_path,
        nobg_path,
        mask_path,
        glb_path if glb_path else ply_path,  # viewer falls back to the PLY
        glb_path,
        ply_path,
        final_image,
        1,
    )
def edit_3d_ui(edit_prompt, current_image, edit_count, progress=gr.Progress()):
    """UI wrapper for editing.

    Sends ``current_image`` and the edit instruction to Gemini, builds a
    grayscale mask from the edited image, and reruns SAM-3D.

    Args:
        edit_prompt: Description of the edit to apply.
        current_image: Image to edit; None means nothing has been generated yet.
        edit_count: Counter value before this edit.
        progress: Gradio progress tracker.

    Returns:
        A 7-tuple: edited image path, mask path, path for the 3D viewer (GLB
        when available, otherwise PLY), GLB path or None, PLY path, the edited
        image object, and ``edit_count + 1``.

    Raises:
        gr.Error: If there is no image to edit, the edit prompt is empty, the
            API key is missing, or any pipeline stage fails.
    """
    if current_image is None:
        raise gr.Error("No image to edit. Generate a 3D model first!")
    if not edit_prompt or not edit_prompt.strip():
        raise gr.Error("Please enter an edit prompt")
    if not client:
        if not init_gemini():
            raise gr.Error("GEMINI_API_KEY not set")

    progress(0.1, desc=f"Applying edit: {edit_prompt}...")
    try:
        image_part = types.Part.from_bytes(
            data=image_to_bytes(current_image),
            mime_type="image/png"
        )
        full_edit_prompt = f"{edit_prompt}. Keep the background transparent. Maintain image quality and lighting."
        response_edit = client.models.generate_content(
            model="gemini-3-pro-image-preview",
            contents=[full_edit_prompt, image_part],
        )
        edited_image = None
        # ``parts`` may be None when the response carries no content.
        for part in response_edit.parts or []:
            if part.inline_data:
                edited_image = Image.open(io.BytesIO(part.inline_data.data))
                break
    except Exception as e:
        raise gr.Error(f"Edit failed: {e}") from e
    # Checked outside the try so this gr.Error is not caught and re-wrapped
    # into "Edit failed: Edit failed".
    if edited_image is None:
        raise gr.Error("Edit failed")

    progress(0.3, desc="Creating new mask...")
    gray = edited_image.convert("L")

    progress(0.4, desc="Running SAM-3D (1-2 min)...")
    try:
        ply_bytes, glb_bytes = run_sam3d(edited_image, gray)
    except Exception as e:
        raise gr.Error(f"SAM-3D failed: {e}") from e

    progress(0.9, desc="Saving outputs...")
    temp_dir = tempfile.mkdtemp()
    nobg_path = os.path.join(temp_dir, "edited.png")
    mask_path = os.path.join(temp_dir, "mask.png")
    ply_path = os.path.join(temp_dir, "model.ply")
    edited_image.save(nobg_path)
    gray.save(mask_path)
    with open(ply_path, 'wb') as f:
        f.write(ply_bytes)
    # The GLB is optional: only written when SAM-3D returned one.
    glb_path = None
    if glb_bytes:
        glb_path = os.path.join(temp_dir, "model.glb")
        with open(glb_path, 'wb') as f:
            f.write(glb_bytes)

    new_edit_count = edit_count + 1
    progress(1.0, desc=f"Edit #{new_edit_count} complete!")
    return (
        nobg_path,
        mask_path,
        glb_path if glb_path else ply_path,  # viewer falls back to the PLY
        glb_path,
        ply_path,
        edited_image,
        new_edit_count,
    )
# ============================================================
# MCP TOOL INTERFACES
# ============================================================
# Text-in / JSON-out interface around generate_3d_model, published under the
# API name "generate_3d".
generate_tool = gr.Interface(
    title="Generate 3D Model",
    description="Generate a 3D model from a text description",
    api_name="generate_3d",
    fn=generate_3d_model,
    inputs=gr.Textbox(label="Prompt", placeholder="A red sports car"),
    outputs=gr.Textbox(label="Result (JSON)"),
)
# Interface around edit_3d_model, published under the API name "edit_3d":
# takes an edit instruction plus the path of a previously generated
# transparent PNG and returns the JSON result string.
edit_tool = gr.Interface(
    title="Edit 3D Model",
    description="Edit an existing 3D model",
    api_name="edit_3d",
    fn=edit_3d_model,
    inputs=[
        gr.Textbox(label="Edit Prompt", placeholder="Remove the wings"),
        gr.Textbox(label="Transparent Image Path", placeholder="/path/to/transparent.png"),
    ],
    outputs=gr.Textbox(label="Result (JSON)"),
)
# ============================================================
# MAIN UI
# ============================================================
# Interactive page: generate a model from a text prompt, then apply edits to it.
# NOTE(review): the source's indentation was lost; the nesting below (Examples
# and edit_counter directly under gr.Blocks, handlers inside the Blocks
# context) is the assumed layout — confirm against the deployed app.
with gr.Blocks() as main_ui:
    # State carried between clicks: the image returned by the last
    # generate/edit call, and the counter shown under the edit box.
    current_image_state = gr.State(None)
    edit_count_state = gr.State(0)
    gr.Markdown("""
# 🎨 Text to 3D Model (MCP Server)
### Powered by Gemini + SAM-3D Objects
**This app is also an MCP Server!** Claude Desktop, Cursor, and other MCP clients can use the `generate_3d` and `edit_3d` tools.
⏱️ *Generation takes 1-2 minutes. First run may take longer as the model warms up.*
""")
    # --- Section 1: prompt + generate button ---
    gr.Markdown("## 1️⃣ Generate Initial 3D Model")
    with gr.Row():
        with gr.Column(scale=2):
            prompt_input = gr.Textbox(label="Text Prompt", placeholder="A plane with eagle wings", lines=2)
        with gr.Column(scale=1):
            generate_btn = gr.Button("🚀 Generate", variant="primary", size="lg")
    gr.Examples(
        examples=["A plane with eagle wings", "A wooden chair", "A red sports car", "A ceramic coffee mug", "A robot dog"],
        inputs=prompt_input
    )
    # --- Section 2: edit prompt + apply button + edit counter ---
    gr.Markdown("## 2️⃣ Edit Your Model")
    with gr.Row():
        with gr.Column(scale=2):
            edit_input = gr.Textbox(label="Edit Prompt", placeholder="Remove the wings", lines=2)
        with gr.Column(scale=1):
            edit_btn = gr.Button("✏️ Apply Edit", variant="secondary", size="lg")
    edit_counter = gr.Markdown("*No edits yet*")
    gr.Examples(
        examples=["Remove the wings", "Change color to blue", "Add racing stripes", "Make it larger", "Add wheels"],
        inputs=edit_input
    )
    # --- Outputs: intermediate images, 3D viewer, file downloads ---
    gr.Markdown("## 📸 Images")
    with gr.Row():
        original_output = gr.Image(label="1. Original", type="filepath")
        nobg_output = gr.Image(label="2. Transparent", type="filepath")
        mask_output = gr.Image(label="3. Mask", type="filepath")
    gr.Markdown("## 🎮 3D Model")
    model_output = gr.Model3D(label="Interactive 3D Model (drag to rotate)", clear_color=[0.1, 0.1, 0.1, 1.0])
    gr.Markdown("## 📥 Downloads")
    with gr.Row():
        glb_download = gr.File(label="GLB (mesh)")
        ply_download = gr.File(label="PLY (splat)")
    gr.Markdown("""
---
## 🔌 MCP Server Info
This app exposes two MCP tools: `generate_3d` and `edit_3d`
**Connect via:** `https://YOUR-SPACE.hf.space/gradio_api/mcp/sse`
---
**Built for [MCP 1st Birthday Hackathon](https://huggingface.co/MCP-1st-Birthday)** 🎂
""")

    def update_counter(count):
        """Return the Markdown text for the edit counter label."""
        return "*No edits yet*" if count == 0 else f"**Edits applied: {count}**"

    # Generate fills all eight outputs (including both state values), then
    # refreshes the counter label from the updated state.
    generate_btn.click(
        fn=generate_3d_ui,
        inputs=[prompt_input],
        outputs=[original_output, nobg_output, mask_output, model_output, glb_download, ply_download, current_image_state, edit_count_state]
    ).then(fn=update_counter, inputs=[edit_count_state], outputs=[edit_counter])
    # Edit writes the same outputs except the original image, which is left
    # showing the first generation.
    edit_btn.click(
        fn=edit_3d_ui,
        inputs=[edit_input, current_image_state, edit_count_state],
        outputs=[nobg_output, mask_output, model_output, glb_download, ply_download, current_image_state, edit_count_state]
    ).then(fn=update_counter, inputs=[edit_count_state], outputs=[edit_counter])
# ============================================================
# COMBINE UI + MCP TOOLS
# ============================================================
# (tab label, app) pairs in display order: the interactive page first, then
# the two single-function tool interfaces.
_tabs = [
    ("🎨 Interactive UI", main_ui),
    ("🔧 Generate Tool", generate_tool),
    ("✏️ Edit Tool", edit_tool),
]
demo = gr.TabbedInterface(
    interface_list=[app for _, app in _tabs],
    tab_names=[label for label, _ in _tabs],
    title="Text to 3D | MCP Server",
)

if __name__ == "__main__":
    # mcp_server=True asks Gradio to serve the app as an MCP server as well.
    demo.launch(mcp_server=True)