Text-to-Image Models in Python — Deep Dive
Production text-to-image systems go beyond calling a single model. This guide covers model evaluation frameworks, multi-model routing, prompt engineering at scale, latency optimization, and the infrastructure patterns teams use to serve image generation reliably.
Model evaluation framework
Before choosing a model, build a systematic evaluation pipeline:
from dataclasses import dataclass
from pathlib import Path
import json
import time
import torch
from PIL import Image
@dataclass
class EvalResult:
    """Measurements recorded for one generated image."""

    # Label of the model/pipeline that produced the image.
    model_name: str
    # Prompt the image was generated from.
    prompt: str
    # Wall-clock generation time in seconds.
    generation_time: float
    # Peak CUDA memory allocated during generation, in megabytes (bytes / 1e6).
    vram_peak_mb: float
    # CLIP image-text score of the image against the prompt.
    clip_score: float
    # Location of the saved image file.
    image_path: str
class ModelEvaluator:
    """Benchmark text-to-image pipelines on a fixed set of prompts.

    For each prompt the evaluator records wall-clock generation time, peak
    CUDA memory, and a CLIP image-text score, and saves the image to disk.
    Requires a CUDA device: the CLIP model is placed on "cuda" and memory
    statistics are read through ``torch.cuda``.
    """

    CLIP_MODEL_ID = "openai/clip-vit-large-patch14"

    def __init__(self, eval_prompts: list[str], output_dir: str = "./eval_output"):
        """Store the prompts, create ``output_dir`` and load CLIP.

        Args:
            eval_prompts: Prompts every model is evaluated on.
            output_dir: Directory that receives the generated images.
        """
        self.prompts = eval_prompts
        self.output_dir = Path(output_dir)
        # parents=True so a nested directory such as "runs/2024/eval" works.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self._load_clip()

    def _load_clip(self):
        """Load the CLIP model (onto the GPU) and its processor."""
        from transformers import CLIPProcessor, CLIPModel

        self.clip_model = CLIPModel.from_pretrained(self.CLIP_MODEL_ID).to("cuda")
        self.clip_processor = CLIPProcessor.from_pretrained(self.CLIP_MODEL_ID)

    def compute_clip_score(self, image: "Image.Image", prompt: str) -> float:
        """Return the raw CLIP ``logits_per_image`` value for ``image`` vs ``prompt``."""
        inputs = self.clip_processor(
            text=[prompt],
            images=image,
            return_tensors="pt",
            # Long (e.g. expanded) prompts can exceed the text encoder's
            # token limit; truncate instead of failing.
            truncation=True,
        ).to("cuda")
        # Scoring is inference only; skip building an autograd graph.
        with torch.no_grad():
            outputs = self.clip_model(**inputs)
        return outputs.logits_per_image.item()

    def evaluate_model(self, pipe, model_name: str) -> "list[EvalResult]":
        """Run ``pipe`` on every prompt and collect one ``EvalResult`` each.

        Args:
            pipe: Callable pipeline; invoked as
                ``pipe(prompt, num_inference_steps=30)`` and expected to
                return an object with an ``images`` list.
            model_name: Label stored in the results and used in file names.

        Returns:
            One ``EvalResult`` per prompt, in prompt order.
        """
        # Hub-style ids ("org/name") contain "/", which would otherwise point
        # the save path into a directory that does not exist.
        safe_name = model_name.replace("/", "_").replace("\\", "_")
        results = []
        for i, prompt in enumerate(self.prompts):
            torch.cuda.reset_peak_memory_stats()
            start = time.perf_counter()
            image = pipe(prompt, num_inference_steps=30).images[0]
            # Make sure queued GPU work has finished before reading the clock.
            torch.cuda.synchronize()
            gen_time = time.perf_counter() - start
            vram = torch.cuda.max_memory_allocated() / 1e6
            clip_score = self.compute_clip_score(image, prompt)
            img_path = str(self.output_dir / f"{safe_name}_{i}.png")
            image.save(img_path)
            results.append(EvalResult(
                model_name=model_name,
                prompt=prompt,
                generation_time=gen_time,
                vram_peak_mb=vram,
                clip_score=clip_score,
                image_path=img_path,
            ))
        return results

    def compare(self, results_by_model: dict) -> str:
        """Summarise average time, CLIP score and VRAM per model.

        Args:
            results_by_model: Mapping of model name to its list of results.

        Returns:
            One line per model, joined with newlines. A model with an empty
            result list is reported as "<name>: no results".
        """
        summary = []
        for name, results in results_by_model.items():
            if not results:
                # Avoid dividing by zero when a model produced nothing.
                summary.append(f"{name}: no results")
                continue
            count = len(results)
            avg_time = sum(r.generation_time for r in results) / count
            avg_clip = sum(r.clip_score for r in results) / count
            avg_vram = sum(r.vram_peak_mb for r in results) / count
            summary.append(
                f"{name}: time={avg_time:.2f}s, "
                f"CLIP={avg_clip:.2f}, VRAM={avg_vram:.0f}MB"
            )
        return "\n".join(summary)
Multi-model routing
Production systems often route requests to different models based on content type:
from enum import Enum
class ContentType(Enum):
    """Content categories a generation request can be routed by."""

    PHOTOREALISTIC = "photorealistic"
    ARTISTIC = "artistic"
    ARCHITECTURAL = "architectural"
    CHARACTER = "character"
    FAST_PREVIEW = "fast_preview"
class ModelRouter:
    """Route prompts to a model and sampling settings by content type."""

    # Checked in order: the first category with a matching keyword wins.
    _KEYWORD_ROUTES = (
        (ContentType.PHOTOREALISTIC, ("photo", "realistic", "photograph")),
        (ContentType.ARTISTIC, ("painting", "art", "illustration", "watercolor")),
        (ContentType.ARCHITECTURAL, ("building", "architecture", "interior")),
        (ContentType.CHARACTER, ("character", "person", "portrait", "anime")),
    )

    def __init__(self):
        # Content type -> Hugging Face model id.
        self.routes = {
            ContentType.PHOTOREALISTIC: "stabilityai/stable-diffusion-xl-base-1.0",
            ContentType.ARTISTIC: "prompthero/openjourney-v4",
            ContentType.ARCHITECTURAL: "stabilityai/stable-diffusion-xl-base-1.0",
            ContentType.CHARACTER: "runwayml/stable-diffusion-v1-5",
            ContentType.FAST_PREVIEW: "stabilityai/sdxl-turbo",
        }
        # Content type -> sampling settings passed to the pipeline.
        self.model_configs = {
            ContentType.PHOTOREALISTIC: {"steps": 35, "guidance": 8.0},
            ContentType.ARTISTIC: {"steps": 30, "guidance": 10.0},
            ContentType.ARCHITECTURAL: {"steps": 40, "guidance": 7.0},
            ContentType.CHARACTER: {"steps": 30, "guidance": 7.5},
            # SDXL Turbo is meant to run without classifier-free guidance,
            # so guidance is disabled rather than set to a small value.
            ContentType.FAST_PREVIEW: {"steps": 4, "guidance": 0.0},
        }
        # Model id -> loaded pipeline; filled lazily by _get_or_load.
        self.loaded_models = {}

    def classify_prompt(self, prompt: str) -> ContentType:
        """Simple keyword-based routing. Production systems use a classifier.

        A keyword matches when a word of the prompt *starts with* it, so
        "photographs" and "artistic" match while "party" or "smart" no longer
        trigger the "art" keyword. Falls back to PHOTOREALISTIC when nothing
        matches. FAST_PREVIEW is never inferred; request it explicitly.
        """
        # Split on anything that is not a letter or digit.
        words = "".join(
            ch if ch.isalnum() else " " for ch in prompt.lower()
        ).split()
        for content_type, keywords in self._KEYWORD_ROUTES:
            if any(word.startswith(keywords) for word in words):
                return content_type
        return ContentType.PHOTOREALISTIC  # default

    def _get_or_load(self, model_id: str):
        """Return the pipeline for ``model_id``, loading it on first use."""
        pipe = self.loaded_models.get(model_id)
        if pipe is None:
            # Imported here so the class can be defined without diffusers.
            from diffusers import DiffusionPipeline

            pipe = DiffusionPipeline.from_pretrained(
                model_id, torch_dtype=torch.float16
            ).to("cuda")
            # NOTE(review): every routed model stays resident on the GPU once
            # loaded; add eviction if VRAM is tight.
            self.loaded_models[model_id] = pipe
        return pipe

    def generate(self, prompt: str, content_type: "ContentType | None" = None):
        """Generate one image for ``prompt``.

        Args:
            prompt: Text prompt.
            content_type: Route to use; inferred from the prompt when None.

        Returns:
            The first image produced by the selected pipeline.
        """
        if content_type is None:
            content_type = self.classify_prompt(prompt)
        model_id = self.routes[content_type]
        config = self.model_configs[content_type]
        pipe = self._get_or_load(model_id)
        return pipe(
            prompt,
            num_inference_steps=config["steps"],
            guidance_scale=config["guidance"],
        ).images[0]
Prompt optimization at scale
Prompt expansion
Automatically enhance sparse prompts with quality-boosting modifiers:
class PromptOptimizer:
    """Expand sparse prompts with style and quality modifiers."""

    def __init__(self):
        self.quality_suffixes = [
            "highly detailed", "professional quality",
            "sharp focus", "8k resolution",
        ]
        self.style_map = {
            "photo": "DSLR photograph, natural lighting, bokeh",
            "art": "digital art, trending on artstation, vibrant colors",
            "sketch": "pencil drawing, detailed linework, crosshatching",
        }
        self.negative_base = (
            "blurry, low quality, distorted, deformed, "
            "watermark, text, signature, jpeg artifacts"
        )

    def optimize(
        self, prompt: str, style: str = "photo",
        add_quality: bool = True,
    ) -> tuple[str, str]:
        """Return ``(enhanced_prompt, negative_prompt)`` for ``prompt``.

        Args:
            prompt: User prompt; used verbatim when it is not blank.
            style: Key into ``style_map``; unknown styles add nothing.
            add_quality: Append the first two quality suffixes when True.

        Returns:
            The comma-joined enhanced prompt and the shared negative prompt.
        """
        # Build from parts so a blank prompt does not leave a leading ", ".
        parts = [prompt] if prompt.strip() else []
        if style in self.style_map:
            parts.append(self.style_map[style])
        if add_quality:
            parts.extend(self.quality_suffixes[:2])
        return ", ".join(parts), self.negative_base

    def batch_optimize(self, prompts: list[str], style: str = "photo"):
        """Apply ``optimize`` to each prompt and return the list of results."""
        return [self.optimize(p, style) for p in prompts]
A/B testing prompt variations
import random
from collections import defaultdict
class PromptABTester:
    """Compare prompt variations by average CLIP score."""

    def __init__(self, pipe, evaluator, device: str = "cuda"):
        """
        Args:
            pipe: Callable pipeline; invoked as
                ``pipe(prompt, generator=..., num_inference_steps=30)``.
            evaluator: Object exposing ``compute_clip_score(image, prompt)``.
            device: Device the seeded ``torch.Generator`` is created on.
        """
        self.pipe = pipe
        self.evaluator = evaluator
        self.device = device
        # Variation name -> scores; accumulates across test_variations calls.
        self.results = defaultdict(list)

    def test_variations(
        self, base_prompt: str,
        variations: dict[str, str],  # name -> modified prompt
        samples_per_variation: int = 5,
    ):
        """Generate samples for each variation and score them.

        Every variation is sampled with the same set of seeds, so score
        differences come from the prompt rather than from seed luck. Images
        are always scored against ``base_prompt``.

        Returns:
            Mapping of variation name to its average score over everything
            recorded so far (also printed, one line per variation).
        """
        # Draw the seeds once and reuse them for every variation.
        seeds = [random.randrange(2**32) for _ in range(samples_per_variation)]
        for name, prompt in variations.items():
            for seed in seeds:
                generator = torch.Generator(self.device).manual_seed(seed)
                image = self.pipe(
                    prompt,
                    generator=generator,
                    num_inference_steps=30,
                ).images[0]
                score = self.evaluator.compute_clip_score(image, base_prompt)
                self.results[name].append(score)

        # Report
        averages = {}
        for name, scores in self.results.items():
            avg = sum(scores) / len(scores)
            averages[name] = avg
            print(f"{name}: avg CLIP score = {avg:.3f}")
        return averages
Latency optimization techniques
Token merging (ToMe)
Reduces computation by merging redundant tokens inside the U-Net's transformer (attention) blocks:
import tomesd
# Load Stable Diffusion 1.5 in half precision, then move it onto the GPU.
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
)
pipe = pipe.to("cuda")
# Apply the ToMe patch to the pipeline; the return value is not needed here.
tomesd.apply_patch(pipe, ratio=0.5)  # Merge 50% of tokens
# 40-60% speedup with minimal quality loss
DeepCache
Caches intermediate U-Net features across denoising steps:
from DeepCache import DeepCacheSDHelper
# Wrap the existing pipeline (`pipe`, created earlier) in a DeepCache helper.
helper = DeepCacheSDHelper(pipe=pipe)
# NOTE(review): presumably cache_interval=3 means cached features are refreshed
# every third denoising step and cache_branch_id selects which U-Net branch is
# cached — confirm against the DeepCache documentation.
helper.set_params(cache_interval=3, cache_branch_id=0)
# Caching is switched on before generating; this snippet has no matching
# disable call, so confirm whether later pipe calls should stay cached.
helper.enable()
image = pipe("landscape", num_inference_steps=30).images[0]
# ~2.5x speedup by reusing cached features
Compiled inference
# Replace the pipeline's U-Net and its VAE decode method with torch.compile
# wrappers (mode="max-autotune", fullgraph=True).
# NOTE(review): compilation presumably happens on the first call, so expect a
# slow warm-up generation — confirm against the torch.compile documentation.
pipe.unet = torch.compile(pipe.unet, mode="max-autotune", fullgraph=True)
pipe.vae.decode = torch.compile(pipe.vae.decode, mode="max-autotune", fullgraph=True)
Serving architecture
Multi-GPU serving with load balancing
import asyncio
from collections import deque
class MultiGPUServer:
    """Serve one model from several GPUs, one request per GPU at a time."""

    def __init__(self, model_id: str, gpu_ids: list[int]):
        """Load one pipeline per GPU.

        Args:
            model_id: Model passed to ``StableDiffusionXLPipeline.from_pretrained``.
            gpu_ids: CUDA device indices to load the model on. Duplicates are
                ignored so a GPU is never handed out twice.

        Raises:
            ValueError: If ``gpu_ids`` is empty.
        """
        if not gpu_ids:
            raise ValueError("gpu_ids must contain at least one GPU id")
        self.pipes = {}
        for gpu_id in gpu_ids:
            if gpu_id in self.pipes:
                continue
            pipe = StableDiffusionXLPipeline.from_pretrained(
                model_id, torch_dtype=torch.float16
            ).to(f"cuda:{gpu_id}")
            pipe.enable_xformers_memory_efficient_attention()
            self.pipes[gpu_id] = pipe
        # Pool of idle GPU ids (unique, in the order given).
        self.available_gpus = deque(self.pipes)
        # Kept so existing code that references `lock` still works; the
        # semaphore below is what guards the pool now.
        self.lock = asyncio.Lock()
        # One slot per GPU: holding a slot guarantees the pool is non-empty.
        self._gpu_slots = asyncio.Semaphore(len(self.pipes))

    async def generate(self, prompt: str, **kwargs):
        """Generate one image on the next idle GPU.

        Waits (without blocking the event loop) until a GPU is free, runs the
        pipeline in a worker thread, and returns the GPU to the pool when
        done, whether generation succeeds or raises.

        Args:
            prompt: Text prompt.
            **kwargs: Forwarded to the pipeline call.

        Returns:
            The first image produced by the pipeline.
        """
        async with self._gpu_slots:
            gpu_id = self.available_gpus.popleft()
            try:
                # The pipeline call is blocking; run it off the event loop so
                # other requests can use the remaining GPUs in parallel.
                output = await asyncio.to_thread(
                    self.pipes[gpu_id], prompt, **kwargs
                )
                return output.images[0]
            finally:
                # NOTE(review): if this task is cancelled the worker thread
                # keeps running, yet the GPU is returned here — confirm that
                # is acceptable for the deployment.
                self.available_gpus.append(gpu_id)
Streaming progress updates
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
import json
app = FastAPI()


@app.post("/generate/stream")
async def generate_stream(prompt: str):
    """Stream generation progress and the final image as server-sent events.

    Events are JSON objects on ``data:`` lines, in this order:
    ``{"status": "started"}``, one ``{"status": "progress", ...}`` per
    denoising step, then ``{"status": "complete", "image": <base64 PNG>}``.
    If generation fails, a ``{"status": "error", ...}`` event is sent and the
    exception is re-raised. Uses the module-level ``pipe``.
    """

    async def event_stream():
        import asyncio
        import base64
        from io import BytesIO

        total_steps = 30
        loop = asyncio.get_running_loop()
        # Carries progress dicts from the worker thread to this generator;
        # None marks the end of generation.
        events = asyncio.Queue()

        def callback(_pipeline, step, timestep, kwargs):
            # NOTE(review): assumes diffusers passes a 0-based step index, so
            # the last step reports 100% — confirm for the pipeline in use.
            done = step + 1
            progress = {
                "status": "progress",
                "step": done,
                "total": total_steps,
                "percent": round(done / total_steps * 100),
            }
            # Runs in the worker thread, so hand off via the event loop.
            loop.call_soon_threadsafe(events.put_nowait, progress)
            return kwargs

        def run_pipeline():
            return pipe(
                prompt,
                num_inference_steps=total_steps,
                callback_on_step_end=callback,
            ).images[0]

        yield f"data: {json.dumps({'status': 'started'})}\n\n"

        # Run the blocking pipeline off the event loop so events can be
        # flushed to the client while generation is in progress.
        worker = asyncio.create_task(asyncio.to_thread(run_pipeline))
        worker.add_done_callback(lambda _done: events.put_nowait(None))

        while (progress := await events.get()) is not None:
            yield f"data: {json.dumps(progress)}\n\n"

        try:
            image = worker.result()
        except Exception:
            # Tell the client before surfacing the failure to the server.
            yield f"data: {json.dumps({'status': 'error', 'message': 'generation failed'})}\n\n"
            raise

        # Encode final image
        buffer = BytesIO()
        image.save(buffer, format="PNG")
        img_b64 = base64.b64encode(buffer.getvalue()).decode()
        yield f"data: {json.dumps({'status': 'complete', 'image': img_b64})}\n\n"

    return StreamingResponse(event_stream(), media_type="text/event-stream")
Cost and latency benchmarks (2024 data)
| Model | Hardware | Time per image | Cost per image |
|---|---|---|---|
| SD 1.5 (30 steps) | RTX 3090 | 2.5s | ~$0.001 |
| SDXL (30 steps) | A100 40GB | 4.5s | ~$0.005 |
| SDXL Turbo (4 steps) | RTX 3090 | 0.8s | ~$0.0003 |
| DALL-E 3 (API) | Cloud | 8–15s | $0.040–0.080 |
| Midjourney (API) | Cloud | 15–30s | ~$0.05 |
The order-of-magnitude cost gap between self-hosted and API-based generation — roughly 10× to 100× in the table above, depending on which models you compare — is the primary reason teams invest in building their own infrastructure.
One thing to remember: Production text-to-image systems combine model evaluation, intelligent routing, prompt optimization, and multi-GPU serving — and the architecture decisions around which model handles which content type, how to optimize latency, and when to use APIs versus self-hosted models determine both cost and quality outcomes.
See Also
- Diffusion Models — Stable Diffusion and DALL-E don't 'draw' your images — they unspoil a scrambled mess until a picture emerges. Here's the surprisingly simple idea behind it.
- Python ControlNet Image Control — Find out how ControlNet lets you boss around an AI artist by giving it sketches, poses, and outlines to follow.
- Python GAN Training Patterns — Learn how two neural networks compete like an art forger and a detective to create incredibly realistic fake images.
- Python Image Generation Pipelines — Discover how Python chains together multiple steps to turn your ideas into polished AI-generated images, like a factory assembly line for pictures.
- Python Image Inpainting — Learn how Python can magically fill in missing parts of a photo, like erasing something and having the picture fix itself.