Text-to-Image Models in Python — Deep Dive

Architect text-to-image systems in Python — model evaluation frameworks, multi-model routing, prompt optimization, and production deployment patterns.

Production text-to-image systems go beyond calling a single model. This guide covers model evaluation frameworks, multi-model routing, prompt engineering at scale, latency optimization, and the infrastructure patterns teams use to serve image generation reliably.

Model evaluation framework

Before choosing a model, build a systematic evaluation pipeline:

from dataclasses import dataclass
from pathlib import Path
import json
import time
import torch
from PIL import Image

@dataclass
class EvalResult:
    """Metrics recorded for a single (model, prompt) generation run."""
    # Label identifying the model that produced the image.
    model_name: str
    # Prompt text the image was generated from.
    prompt: str
    # Wall-clock duration of the pipeline call, in seconds.
    generation_time: float
    # Peak CUDA memory allocated during generation, in units of 1e6 bytes.
    vram_peak_mb: float
    # CLIP image-text logit for (image, prompt).
    clip_score: float
    # Path where the generated image was saved.
    image_path: str
class ModelEvaluator:
    """Benchmark text-to-image pipelines on a fixed set of prompts.

    For every prompt the evaluator records wall-clock generation time, peak
    CUDA memory, and a CLIP image-text score, and saves the generated image
    under ``output_dir``.
    """

    def __init__(self, eval_prompts: list[str], output_dir: str = "./eval_output"):
        """Store the prompt set, create the output directory and load CLIP.

        Args:
            eval_prompts: Prompts every model is evaluated on.
            output_dir: Directory that receives the generated images.
        """
        self.prompts = eval_prompts
        self.output_dir = Path(output_dir)
        # parents=True: a nested output_dir must not fail on a missing parent.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self._load_clip()

    def _load_clip(self):
        """Load the CLIP model (on the GPU) and its processor."""
        from transformers import CLIPProcessor, CLIPModel
        self.clip_model = CLIPModel.from_pretrained(
            "openai/clip-vit-large-patch14"
        ).to("cuda")
        self.clip_processor = CLIPProcessor.from_pretrained(
            "openai/clip-vit-large-patch14"
        )

    @staticmethod
    def _safe_name(model_name: str) -> str:
        """Return ``model_name`` with path separators replaced by ``_``.

        Hub-style names such as ``org/model`` would otherwise be interpreted
        as a sub-directory when used in a file name.
        """
        return model_name.replace("/", "_").replace("\\", "_")

    def compute_clip_score(self, image: "Image.Image", prompt: str) -> float:
        """Return the CLIP image-text logit for ``image`` and ``prompt``."""
        inputs = self.clip_processor(
            text=[prompt],
            images=image,
            return_tensors="pt",
            padding=True,
            # Without truncation, prompts longer than CLIP's text context
            # make the forward pass fail.
            truncation=True,
        ).to("cuda")
        # Scoring is inference only; skip building the autograd graph.
        with torch.no_grad():
            outputs = self.clip_model(**inputs)
        return outputs.logits_per_image.item()

    def evaluate_model(self, pipe, model_name: str) -> "list[EvalResult]":
        """Run ``pipe`` on every prompt and collect one result per prompt.

        Args:
            pipe: Callable pipeline; called as
                ``pipe(prompt, num_inference_steps=30)`` and expected to
                return an object with an ``images`` list.
            model_name: Label stored in the results and used (sanitised)
                in the saved image file names.

        Returns:
            One ``EvalResult`` per prompt, in prompt order.
        """
        results = []
        file_stem = self._safe_name(model_name)
        for i, prompt in enumerate(self.prompts):
            torch.cuda.reset_peak_memory_stats()

            start = time.perf_counter()
            image = pipe(prompt, num_inference_steps=30).images[0]
            gen_time = time.perf_counter() - start

            # Read the peak before CLIP scoring so the scoring pass does not
            # inflate the number reported for the generator.
            vram = torch.cuda.max_memory_allocated() / 1e6
            clip_score = self.compute_clip_score(image, prompt)

            img_path = str(self.output_dir / f"{file_stem}_{i}.png")
            image.save(img_path)

            results.append(EvalResult(
                model_name=model_name,
                prompt=prompt,
                generation_time=gen_time,
                vram_peak_mb=vram,
                clip_score=clip_score,
                image_path=img_path,
            ))

        return results

    def compare(self, results_by_model: dict) -> str:
        """Return a one-line-per-model summary of average metrics.

        Args:
            results_by_model: Mapping of model name to its list of results.

        Returns:
            Newline-joined summary lines. A model with no results is
            reported as ``"<name>: no results"`` instead of raising
            ``ZeroDivisionError``.
        """
        summary = []
        for name, results in results_by_model.items():
            if not results:
                summary.append(f"{name}: no results")
                continue
            count = len(results)
            avg_time = sum(r.generation_time for r in results) / count
            avg_clip = sum(r.clip_score for r in results) / count
            avg_vram = sum(r.vram_peak_mb for r in results) / count
            summary.append(
                f"{name}: time={avg_time:.2f}s, "
                f"CLIP={avg_clip:.2f}, VRAM={avg_vram:.0f}MB"
            )
        return "\n".join(summary)

Multi-model routing

Production systems often route requests to different models based on content type:

import re
from enum import Enum


class ContentType(Enum):
    """Content categories a prompt can be routed to."""
    PHOTOREALISTIC = "photorealistic"
    ARTISTIC = "artistic"
    ARCHITECTURAL = "architectural"
    CHARACTER = "character"
    FAST_PREVIEW = "fast_preview"


class ModelRouter:
    """Pick a model and sampling settings per content type, then generate."""

    # Ordered rules: the first category with a matching keyword wins.
    # Keywords match at the START of a word only, so "art" matches "art" and
    # "artwork" but no longer fires on "party", "start" or "apartment".
    _KEYWORD_RULES = tuple(
        (ctype, re.compile(r"\b(?:" + "|".join(map(re.escape, words)) + ")"))
        for ctype, words in (
            (ContentType.PHOTOREALISTIC, ("photo", "realistic", "photograph")),
            (ContentType.ARTISTIC, ("painting", "art", "illustration", "watercolor")),
            (ContentType.ARCHITECTURAL, ("building", "architecture", "interior")),
            (ContentType.CHARACTER, ("character", "person", "portrait", "anime")),
        )
    )

    def __init__(self):
        """Set up the routing table, per-type settings and the model cache."""
        self.routes = {
            ContentType.PHOTOREALISTIC: "stabilityai/stable-diffusion-xl-base-1.0",
            ContentType.ARTISTIC: "prompthero/openjourney-v4",
            ContentType.ARCHITECTURAL: "stabilityai/stable-diffusion-xl-base-1.0",
            ContentType.CHARACTER: "runwayml/stable-diffusion-v1-5",
            ContentType.FAST_PREVIEW: "stabilityai/sdxl-turbo",
        }
        self.model_configs = {
            ContentType.PHOTOREALISTIC: {"steps": 35, "guidance": 8.0},
            ContentType.ARTISTIC: {"steps": 30, "guidance": 10.0},
            ContentType.ARCHITECTURAL: {"steps": 40, "guidance": 7.0},
            ContentType.CHARACTER: {"steps": 30, "guidance": 7.5},
            # SDXL-Turbo is documented to run with classifier-free guidance
            # disabled (guidance_scale=0.0).
            ContentType.FAST_PREVIEW: {"steps": 4, "guidance": 0.0},
        }
        # model_id -> loaded pipeline; filled lazily by _get_or_load.
        self.loaded_models = {}

    def classify_prompt(self, prompt: str) -> ContentType:
        """Simple keyword-based routing. Production systems use a classifier.

        Returns the first content type whose keyword starts a word in the
        lower-cased prompt, or ``ContentType.PHOTOREALISTIC`` when nothing
        matches. Prefix matching is still a heuristic (e.g. "article" hits
        the "art" keyword).
        """
        prompt_lower = prompt.lower()
        for content_type, pattern in self._KEYWORD_RULES:
            if pattern.search(prompt_lower):
                return content_type
        return ContentType.PHOTOREALISTIC  # default

    def _get_or_load(self, model_id: str):
        """Return the pipeline for ``model_id``, loading it on first use.

        Loaded pipelines stay resident on the GPU in ``self.loaded_models``;
        nothing is evicted, so the routed models must fit in VRAM together.
        """
        pipe = self.loaded_models.get(model_id)
        if pipe is None:
            # Imported lazily so routing/classification works without the
            # heavy dependencies installed.
            import torch
            from diffusers import AutoPipelineForText2Image

            pipe = AutoPipelineForText2Image.from_pretrained(
                model_id, torch_dtype=torch.float16
            ).to("cuda")
            self.loaded_models[model_id] = pipe
        return pipe

    def generate(self, prompt: str, content_type: ContentType = None):
        """Generate one image for ``prompt``.

        Args:
            prompt: Text prompt.
            content_type: Route to use; when ``None`` it is inferred with
                ``classify_prompt``.

        Returns:
            The first image of the pipeline output.
        """
        if content_type is None:
            content_type = self.classify_prompt(prompt)

        model_id = self.routes[content_type]
        config = self.model_configs[content_type]
        pipe = self._get_or_load(model_id)

        return pipe(
            prompt,
            num_inference_steps=config["steps"],
            guidance_scale=config["guidance"],
        ).images[0]

Prompt optimization at scale

Prompt expansion

Automatically enhance sparse prompts with quality-boosting modifiers:

class PromptOptimizer:
    """Append style and quality modifiers to short prompts."""

    def __init__(self):
        """Define the quality tags, style presets and shared negative prompt."""
        self.quality_suffixes = [
            "highly detailed",
            "professional quality",
            "sharp focus",
            "8k resolution",
        ]
        self.style_map = dict(
            photo="DSLR photograph, natural lighting, bokeh",
            art="digital art, trending on artstation, vibrant colors",
            sketch="pencil drawing, detailed linework, crosshatching",
        )
        self.negative_base = (
            "blurry, low quality, distorted, deformed, "
            "watermark, text, signature, jpeg artifacts"
        )

    def optimize(
        self, prompt: str, style: str = "photo",
        add_quality: bool = True,
    ) -> tuple[str, str]:
        """Return ``(enhanced_prompt, negative_prompt)`` for ``prompt``.

        The style preset is appended when ``style`` is a known key, followed
        by the first two quality tags when ``add_quality`` is true.
        """
        pieces = [prompt]

        preset = self.style_map.get(style)
        if style in self.style_map:
            pieces.append(preset)

        if add_quality:
            # Only the first two quality tags are used.
            pieces.append(", ".join(self.quality_suffixes[:2]))

        return ", ".join(pieces), self.negative_base

    def batch_optimize(self, prompts: list[str], style: str = "photo"):
        """Apply ``optimize`` with the given style to each prompt, in order."""
        optimized = []
        for text in prompts:
            optimized.append(self.optimize(text, style))
        return optimized

A/B testing prompt variations

import random
from collections import defaultdict

class PromptABTester:
    """Compare prompt variations by CLIP score against a common base prompt."""

    def __init__(self, pipe, evaluator, device: str = "cuda"):
        """Store the collaborators used for testing.

        Args:
            pipe: Callable pipeline; called as
                ``pipe(prompt, generator=..., num_inference_steps=30)``.
            evaluator: Object exposing ``compute_clip_score(image, prompt)``.
            device: Device the seeded ``torch.Generator`` is created on.
        """
        self.pipe = pipe
        self.evaluator = evaluator
        self.device = device
        # variation name -> scores accumulated over all test_variations calls
        self.results = defaultdict(list)

    def test_variations(
        self, base_prompt: str,
        variations: dict[str, str],  # name -> modified prompt
        samples_per_variation: int = 5,
    ):
        """Generate samples for every variation and report average scores.

        Every variation is sampled with the SAME list of random seeds, so
        score differences come from the prompt rather than from seed noise.

        Args:
            base_prompt: Prompt every image is scored against.
            variations: Mapping of variation name to the prompt to generate.
            samples_per_variation: Number of images per variation.

        Returns:
            Mapping of variation name to average CLIP score over all scores
            accumulated in ``self.results`` (also printed).
        """
        # Draw the seeds once and reuse them for each variation.
        seeds = [random.randint(0, 2**32 - 1) for _ in range(samples_per_variation)]

        for name, prompt in variations.items():
            for seed in seeds:
                generator = torch.Generator(self.device).manual_seed(seed)

                image = self.pipe(
                    prompt,
                    generator=generator,
                    num_inference_steps=30,
                ).images[0]

                score = self.evaluator.compute_clip_score(image, base_prompt)
                self.results[name].append(score)

        # Report
        averages = {}
        for name, scores in self.results.items():
            if not scores:
                continue
            avg = sum(scores) / len(scores)
            averages[name] = avg
            print(f"{name}: avg CLIP score = {avg:.3f}")
        return averages

Latency optimization techniques

Token merging (ToMe)

Reduces computation by merging redundant tokens inside the U-Net's transformer (attention) blocks:

import tomesd
import torch
from diffusers import StableDiffusionPipeline

# Load Stable Diffusion 1.5 in half precision on the GPU.
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Patch the pipeline in place; later pipe(...) calls use token merging.
tomesd.apply_patch(pipe, ratio=0.5)  # Merge 50% of tokens
# 40-60% speedup with minimal quality loss

DeepCache

Caches intermediate U-Net features across denoising steps:

from DeepCache import DeepCacheSDHelper

# Wrap the existing pipeline object (`pipe` must already be loaded).
helper = DeepCacheSDHelper(pipe=pipe)
# NOTE(review): cache_interval presumably sets how many denoising steps reuse
# one set of cached features, and cache_branch_id which U-Net branch is
# cached; confirm both against the DeepCache documentation.
helper.set_params(cache_interval=3, cache_branch_id=0)
# Caching takes effect for pipeline calls made after enable().
helper.enable()

image = pipe("landscape", num_inference_steps=30).images[0]
# ~2.5x speedup by reusing cached features

Compiled inference

# Replace the U-Net and the VAE decode method with torch.compile-wrapped
# versions; `pipe` must already be loaded.
# NOTE(review): compilation presumably happens on the first call, so expect a
# slow warm-up generation before any speedup shows; confirm on target hardware.
pipe.unet = torch.compile(pipe.unet, mode="max-autotune", fullgraph=True)
pipe.vae.decode = torch.compile(pipe.vae.decode, mode="max-autotune", fullgraph=True)

Serving architecture

Multi-GPU serving with load balancing

import asyncio
from collections import deque

class MultiGPUServer:
    """Hold one pipeline replica per GPU and run each request on an idle GPU."""

    def __init__(self, model_id: str, gpu_ids: list[int]):
        """Load one pipeline replica onto every listed GPU.

        Args:
            model_id: Model identifier passed to ``from_pretrained``.
            gpu_ids: CUDA device indices to serve from. Duplicates are
                ignored so two requests never share one pipeline.

        Raises:
            ValueError: If ``gpu_ids`` is empty (every request would wait
                forever for a GPU).
        """
        gpu_ids = list(dict.fromkeys(gpu_ids))  # de-duplicate, keep order
        if not gpu_ids:
            raise ValueError("gpu_ids must contain at least one GPU id")

        from diffusers import StableDiffusionXLPipeline

        self.pipes = {}
        for gpu_id in gpu_ids:
            pipe = StableDiffusionXLPipeline.from_pretrained(
                model_id, torch_dtype=torch.float16
            ).to(f"cuda:{gpu_id}")
            pipe.enable_xformers_memory_efficient_attention()
            self.pipes[gpu_id] = pipe

        self.available_gpus = deque(gpu_ids)
        # Kept so existing code that references `lock` keeps working;
        # generate() no longer needs it.
        self.lock = asyncio.Lock()
        # One permit per idle GPU. Waiting on the semaphore replaces the old
        # sleep-polling loop, which held `lock` while waiting and so blocked
        # the very coroutine that needed `lock` to hand a GPU back.
        self._gpu_slots = asyncio.Semaphore(len(gpu_ids))

    async def generate(self, prompt: str, **kwargs):
        """Generate one image on the next idle GPU.

        The blocking pipeline call runs in a worker thread so the event loop
        stays responsive and several GPUs can work at the same time.

        Args:
            prompt: Text prompt.
            **kwargs: Forwarded unchanged to the pipeline call.

        Returns:
            The first image of the pipeline output.

        Note:
            Cancelling the awaiting task does not stop the worker thread;
            the GPU is returned to the pool as soon as the task unwinds.
        """
        await self._gpu_slots.acquire()
        # No await between acquire() and popleft(), so a permit always
        # corresponds to an id being present in the deque.
        gpu_id = self.available_gpus.popleft()
        try:
            pipe = self.pipes[gpu_id]
            output = await asyncio.to_thread(pipe, prompt, **kwargs)
            return output.images[0]
        finally:
            self.available_gpus.append(gpu_id)
            self._gpu_slots.release()

Streaming progress updates

from fastapi import FastAPI
from fastapi.responses import StreamingResponse
import json

app = FastAPI()

@app.post("/generate/stream")
async def generate_stream(prompt: str):
    """Stream generation progress and the final image as server-sent events.

    Events, in order: ``{"status": "started"}``, one
    ``{"status": "progress", "step", "total", "percent"}`` per denoising
    step, then ``{"status": "complete", "image": <base64 PNG>}``.

    The pipeline runs in a worker thread so the event loop can flush
    progress events while the image is being generated. Relies on a
    module-level ``pipe`` having been loaded.
    """
    import asyncio
    import base64
    from io import BytesIO

    total_steps = 30

    def sse(payload: dict) -> str:
        # One server-sent event frame.
        return f"data: {json.dumps(payload)}\n\n"

    async def event_stream():
        loop = asyncio.get_running_loop()
        events = asyncio.Queue()

        def on_step_end(_pipeline, step, _timestep, callback_kwargs):
            # Called from the worker thread: hand the event to the loop
            # thread instead of touching the queue directly.
            done = step + 1  # step is 0-based
            loop.call_soon_threadsafe(events.put_nowait, {
                "status": "progress",
                "step": done,
                "total": total_steps,
                "percent": round(done / total_steps * 100),
            })
            return callback_kwargs

        def run_pipeline():
            return pipe(
                prompt,
                num_inference_steps=total_steps,
                callback_on_step_end=on_step_end,
            ).images[0]

        yield sse({'status': 'started'})

        job = asyncio.create_task(asyncio.to_thread(run_pipeline))
        # None marks the end of progress events (success or failure).
        job.add_done_callback(lambda _done: events.put_nowait(None))

        while True:
            progress = await events.get()
            if progress is None:
                break
            yield sse(progress)

        # Re-raises here if the pipeline failed.
        image = await job

        # Encode final image
        buffer = BytesIO()
        image.save(buffer, format="PNG")
        img_b64 = base64.b64encode(buffer.getvalue()).decode()

        yield sse({'status': 'complete', 'image': img_b64})

    return StreamingResponse(event_stream(), media_type="text/event-stream")

Cost and latency benchmarks (2024 data)

Model	Hardware	Time per image	Cost per image
SD 1.5 (30 steps)	RTX 3090	2.5s	~$0.001
SDXL (30 steps)	A100 40GB	4.5s	~$0.005
SDXL Turbo (4 steps)	RTX 3090	0.8s	~$0.0003
DALL-E 3 (API)	Cloud	8–15s	$0.040–0.080
Midjourney (API)	Cloud	15–30s	~$0.05

The cost gap between self-hosted and API-based generation — roughly one to two orders of magnitude per image in the table above, depending on the model — is the primary reason teams invest in building their own infrastructure.

One thing to remember: Production text-to-image systems combine model evaluation, intelligent routing, prompt optimization, and multi-GPU serving — and the architecture decisions around which model handles which content type, how to optimize latency, and when to use APIs versus self-hosted determine both cost and quality outcomes.

Tags: python, text-to-image, generative-ai, deep-learning