Image Generation Pipelines in Python — Deep Dive
Building production image generation pipelines means going beyond default diffusers calls. This guide covers custom pipeline construction, latent-space manipulation, advanced guidance techniques, ensemble strategies, and scalable architecture patterns.
Custom pipeline from components
The DiffusionPipeline base class provides structure, but you can build entirely custom pipelines by wiring components directly:
import torch
from diffusers import AutoencoderKL, UNet2DConditionModel, DDIMScheduler
from transformers import CLIPTextModel, CLIPTokenizer
class CustomGenerationPipeline:
    """Text-to-image pipeline wired by hand from individual diffusion components.

    The tokenizer, text encoder, U-Net, VAE and DDIM scheduler are each loaded
    from their own sub-folder of ``model_id``. Generation is split into three
    reusable stages so callers can hook in between them:

    * ``encode_prompt`` -- text -> stacked [negative, positive] embeddings
    * ``generate_from_embedding`` -- embeddings (+ optional noise) -> image
    * ``decode_latents`` -- final latents -> PIL image
    """

    def __init__(self, model_id: str, device: str = "cuda"):
        """Load all components from ``model_id`` and move the models to ``device``.

        The U-Net and VAE are loaded in float16; the text encoder is loaded
        without an explicit dtype.
        """
        self.device = device
        self.tokenizer = CLIPTokenizer.from_pretrained(
            model_id, subfolder="tokenizer"
        )
        self.text_encoder = CLIPTextModel.from_pretrained(
            model_id, subfolder="text_encoder"
        ).to(device)
        self.unet = UNet2DConditionModel.from_pretrained(
            model_id, subfolder="unet", torch_dtype=torch.float16
        ).to(device)
        self.vae = AutoencoderKL.from_pretrained(
            model_id, subfolder="vae", torch_dtype=torch.float16
        ).to(device)
        self.scheduler = DDIMScheduler.from_pretrained(
            model_id, subfolder="scheduler"
        )

    def _tokenize(self, text: str):
        """Tokenize ``text`` to a fixed 77-token id tensor on ``self.device``."""
        return self.tokenizer(
            text, padding="max_length",
            max_length=77, truncation=True,
            return_tensors="pt",
        ).input_ids.to(self.device)

    def encode_prompt(self, prompt: str, negative_prompt: str = ""):
        """Return the negative and positive prompt embeddings stacked on dim 0.

        The negative (unconditional) embedding comes first, matching the order
        in which the denoising loop splits the U-Net output.
        """
        tokens = self._tokenize(prompt)
        neg_tokens = self._tokenize(negative_prompt)
        with torch.no_grad():
            prompt_embeds = self.text_encoder(tokens)[0]
            neg_embeds = self.text_encoder(neg_tokens)[0]
        return torch.cat([neg_embeds, prompt_embeds])

    @torch.no_grad()
    def decode_latents(self, latents):
        """Decode denoised latents with the VAE and return the first image as PIL."""
        from PIL import Image

        latents = latents.to(dtype=self.vae.dtype) / self.vae.config.scaling_factor
        decoded = self.vae.decode(latents).sample
        # The VAE output is in [-1, 1]; map it to [0, 1] before quantizing.
        decoded = (decoded / 2 + 0.5).clamp(0, 1)
        pixels = decoded.cpu().permute(0, 2, 3, 1).float().numpy()
        # Round instead of truncating so values are not biased toward black.
        return Image.fromarray((pixels[0] * 255).round().astype("uint8"))

    @torch.no_grad()
    def generate_from_embedding(
        self,
        embeddings,
        initial_latent=None,
        steps: int = 30,
        guidance_scale: float = 7.5,
        width: int = 512,
        height: int = 512,
        seed: int = None,
    ):
        """Run the denoising loop for precomputed embeddings and return a PIL image.

        Args:
            embeddings: stacked [negative, positive] embeddings as produced by
                ``encode_prompt``.
            initial_latent: optional starting noise. It is not modified, so the
                same tensor can be reused across calls. When omitted, noise of
                shape ``(1, 4, height // 8, width // 8)`` is sampled.
            steps: number of scheduler timesteps.
            guidance_scale: classifier-free guidance weight.
            width, height: output size; only used when sampling fresh noise.
            seed: seed for the sampled noise; ``None`` draws from the global RNG.
        """
        model_dtype = self.unet.dtype
        # The text encoder may run in a different precision than the U-Net.
        embeddings = embeddings.to(device=self.device, dtype=model_dtype)

        if initial_latent is None:
            # A freshly constructed torch.Generator starts from a fixed default
            # seed, so only build one when the caller actually asked for a seed.
            generator = None
            if seed is not None:
                generator = torch.Generator(self.device).manual_seed(seed)
            latents = torch.randn(
                (1, 4, height // 8, width // 8),
                generator=generator,
                device=self.device,
                dtype=model_dtype,
            )
        else:
            latents = initial_latent.to(device=self.device, dtype=model_dtype)

        self.scheduler.set_timesteps(steps)
        latents = latents * self.scheduler.init_noise_sigma
        for t in self.scheduler.timesteps:
            # One batched U-Net pass covers both the unconditional and text branch.
            latent_input = torch.cat([latents] * 2)
            latent_input = self.scheduler.scale_model_input(latent_input, t)
            noise_pred = self.unet(
                latent_input, t, encoder_hidden_states=embeddings
            ).sample
            noise_uncond, noise_text = noise_pred.chunk(2)
            noise_pred = noise_uncond + guidance_scale * (
                noise_text - noise_uncond
            )
            latents = self.scheduler.step(noise_pred, t, latents).prev_sample

        return self.decode_latents(latents)

    @torch.no_grad()
    def generate(
        self,
        prompt: str,
        negative_prompt: str = "",
        steps: int = 30,
        guidance_scale: float = 7.5,
        width: int = 512,
        height: int = 512,
        seed: int = None,
    ):
        """Generate one image for ``prompt`` and return it as a PIL image.

        ``seed=None`` produces a different image on every call; pass an integer
        for reproducible output.
        """
        embeddings = self.encode_prompt(prompt, negative_prompt)
        return self.generate_from_embedding(
            embeddings,
            steps=steps,
            guidance_scale=guidance_scale,
            width=width,
            height=height,
            seed=seed,
        )
Building from components gives you complete control over every step, enabling techniques impossible with the standard pipeline.
Dynamic classifier-free guidance
Instead of a fixed guidance scale, vary it across denoising steps. High guidance early establishes composition; lower guidance later preserves natural details:
def dynamic_guidance_schedule(
    step: int, total_steps: int,
    start_scale: float = 12.0,
    end_scale: float = 5.0,
) -> float:
    """Return the guidance scale for ``step`` on a cosine decay curve.

    The scale equals ``start_scale`` at ``step == 0`` and reaches ``end_scale``
    at ``step == total_steps``. Steps outside ``[0, total_steps]`` are clamped
    to the nearest end of the curve. When ``total_steps`` is not positive there
    is no schedule to follow and ``start_scale`` is returned.
    """
    import math

    if total_steps <= 0:
        return start_scale
    # Clamp so the periodic cosine cannot wrap and push guidance back up.
    progress = min(max(step / total_steps, 0.0), 1.0)
    return end_scale + (start_scale - end_scale) * (
        1 + math.cos(math.pi * progress)
    ) / 2
# Inside the denoising loop: look up the guidance scale for the current step
# index and use it in place of a fixed guidance_scale.
total_steps = len(scheduler.timesteps)
for step_index, t in enumerate(scheduler.timesteps):
    cfg_scale = dynamic_guidance_schedule(step_index, total_steps)
    noise_pred = noise_uncond + cfg_scale * (noise_text - noise_uncond)
This produces images with strong prompt adherence and natural-looking textures simultaneously.
Latent space manipulation
Latent interpolation between prompts
Generate smooth transitions between concepts:
def interpolate_prompts(
    pipeline, prompt_a: str, prompt_b: str,
    steps: int = 10, seed: int = 42,
    width: int = 512, height: int = 512,
):
    """Render a sequence of images that morphs from ``prompt_a`` to ``prompt_b``.

    Every frame starts from the same seeded noise, so only the interpolated
    text embedding changes between frames.

    Args:
        pipeline: object exposing ``encode_prompt(prompt, negative_prompt)`` and
            ``generate_from_embedding(embedding, initial_latent=...)``. Its
            ``device`` attribute is used when present, otherwise ``"cuda"``.
        prompt_a, prompt_b: start and end prompts.
        steps: number of interpolation intervals; ``steps + 1`` frames are
            produced. ``steps == 0`` yields a single frame for ``prompt_a``.
        seed: seed for the shared starting noise.
        width, height: output size; the shared latent has shape
            ``(1, 4, height // 8, width // 8)``.

    Returns:
        List of whatever ``pipeline.generate_from_embedding`` returns per frame.
    """
    device = getattr(pipeline, "device", "cuda")
    emb_a = pipeline.encode_prompt(prompt_a, "")
    emb_b = pipeline.encode_prompt(prompt_b, "")

    generator = torch.Generator(device).manual_seed(seed)
    shared_latent = torch.randn(
        (1, 4, height // 8, width // 8), generator=generator,
        device=device, dtype=torch.float16,
    )

    images = []
    for i in range(steps + 1):
        # Guard the single-frame case so steps == 0 does not divide by zero.
        alpha = i / steps if steps else 0.0
        # Spherical interpolation for better results
        emb = slerp(alpha, emb_a, emb_b)
        image = pipeline.generate_from_embedding(
            emb, initial_latent=shared_latent
        )
        images.append(image)
    return images
def slerp(t, v0, v1, dot_threshold=0.9995):
    """Spherical linear interpolation between tensors ``v0`` and ``v1``.

    Both tensors are treated as single flattened vectors. Plain linear
    interpolation is used instead when the vectors are almost (anti-)parallel,
    i.e. ``|cos| > dot_threshold``, or when either has zero norm, because the
    spherical formula is ill-conditioned or undefined there.

    Args:
        t: interpolation factor; 0 returns ``v0`` and 1 returns ``v1``.
        v0, v1: tensors of the same shape.
        dot_threshold: cosine magnitude above which linear interpolation is used.

    Returns:
        Tensor of the same shape. When both inputs share a float16/bfloat16
        dtype the result is cast back to that dtype.
    """
    low_precision = (torch.float16, torch.bfloat16)
    restore_dtype = (
        v0.dtype if v0.dtype == v1.dtype and v0.dtype in low_precision else None
    )
    # Sums of squares over embedding-sized tensors overflow half precision and
    # turn the result into inf/NaN, so do the arithmetic in float32.
    if v0.dtype in low_precision:
        v0 = v0.float()
    if v1.dtype in low_precision:
        v1 = v1.float()

    norm_product = torch.norm(v0) * torch.norm(v1)
    if norm_product == 0:
        # No direction to rotate between; fall back to a straight blend.
        result = (1 - t) * v0 + t * v1
    else:
        # Clamp against rounding error so acos never sees a value outside [-1, 1].
        dot = torch.clamp(torch.sum(v0 * v1) / norm_product, -1.0, 1.0)
        if abs(dot) > dot_threshold:
            result = (1 - t) * v0 + t * v1
        else:
            theta = torch.acos(dot)
            result = (
                torch.sin((1 - t) * theta) * v0 + torch.sin(t * theta) * v1
            ) / torch.sin(theta)

    return result if restore_dtype is None else result.to(restore_dtype)
Latent composition
Combine latent regions from different prompts to create composite scenes:
def compose_latents(
    pipeline,
    regions: list,  # [(prompt, mask, weight), ...]
    steps: int = 30,
    guidance_scale: float = 7.5,
):
    """Generate image with different prompts controlling different regions.

    At every denoising step each region's prompt yields its own guided noise
    prediction; the predictions are multiplied by that region's mask and weight
    and summed into the prediction handed to the scheduler.

    Args:
        pipeline: object exposing ``encode_prompt``, ``unet``, ``scheduler`` and
            ``decode_latents``. Its ``device`` attribute is used when present,
            otherwise ``"cuda"``.
        regions: sequence of ``(prompt, mask, weight)``. Masks must broadcast
            against the ``(1, 4, 64, 64)`` latent.
        steps: number of scheduler timesteps.
        guidance_scale: classifier-free guidance weight applied per region.

    Returns:
        Whatever ``pipeline.decode_latents`` returns for the final latents.

    Raises:
        ValueError: if ``regions`` is empty.

    NOTE(review): contributions are summed without normalization, so callers
    presumably pass masks/weights that add up to 1 at every latent position --
    confirm against real usage.
    """
    if not regions:
        raise ValueError(
            "compose_latents requires at least one (prompt, mask, weight) region"
        )

    device = getattr(pipeline, "device", "cuda")
    unet = pipeline.unet
    scheduler = pipeline.scheduler

    # Inference only: without no_grad every U-Net call would extend the
    # autograd graph and memory would grow with each step.
    with torch.no_grad():
        embeddings = [
            pipeline.encode_prompt(region[0], "").to(dtype=unet.dtype)
            for region in regions
        ]
        latents = torch.randn(
            (1, 4, 64, 64), device=device, dtype=torch.float16
        )
        # Binary masks in latent space, moved next to the latents once up front.
        masks = [
            torch.as_tensor(region[1]).to(device=device, dtype=latents.dtype)
            for region in regions
        ]
        weights = [region[2] for region in regions]

        scheduler.set_timesteps(steps)
        latents = latents * scheduler.init_noise_sigma
        for t in scheduler.timesteps:
            # The doubled, scheduler-scaled input is shared by all regions.
            latent_input = scheduler.scale_model_input(
                torch.cat([latents] * 2), t
            )
            combined = None
            for emb, mask, weight in zip(embeddings, masks, weights):
                pred = unet(latent_input, t, encoder_hidden_states=emb).sample
                uncond, text = pred.chunk(2)
                guided = uncond + guidance_scale * (text - uncond)
                # Weighted combination using masks
                contribution = guided * mask * weight
                combined = (
                    contribution if combined is None else combined + contribution
                )
            latents = scheduler.step(combined, t, latents).prev_sample

        return pipeline.decode_latents(latents)
Ensemble denoising
Use multiple U-Net checkpoints during generation for improved quality:
class EnsemblePipeline:
    """Blend the noise predictions of several U-Net checkpoints.

    Attributes are ``unets`` (the loaded models), ``weights`` (one blend weight
    per model) and ``device``.
    """

    def __init__(self, model_ids: list, weights: list = None, device: str = "cuda"):
        """Load one float16 U-Net per entry of ``model_ids`` onto ``device``.

        Args:
            model_ids: checkpoint identifiers; each must contain a ``unet``
                sub-folder.
            weights: one blend weight per model. When omitted or empty, every
                model gets ``1 / len(model_ids)``.
            device: device the U-Nets are moved to.

        Raises:
            ValueError: if ``model_ids`` is empty or ``weights`` has a different
                length than ``model_ids``.
        """
        # Validate before loading anything: checkpoints are expensive to load,
        # and zip() in predict_noise would silently drop unmatched models.
        if not model_ids:
            raise ValueError("model_ids must contain at least one checkpoint")
        if not weights:
            weights = [1.0 / len(model_ids)] * len(model_ids)
        elif len(weights) != len(model_ids):
            raise ValueError(
                f"expected {len(model_ids)} weights, got {len(weights)}"
            )

        self.device = device
        self.unets = [
            UNet2DConditionModel.from_pretrained(
                mid, subfolder="unet", torch_dtype=torch.float16
            ).to(device)
            for mid in model_ids
        ]
        self.weights = weights

    def predict_noise(self, latents, timestep, encoder_hidden_states):
        """Return the weighted sum of every U-Net's ``.sample`` prediction."""
        return sum(
            unet(
                latents, timestep,
                encoder_hidden_states=encoder_hidden_states,
            ).sample * weight
            for unet, weight in zip(self.unets, self.weights)
        )
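This technique leverages the strengths of different fine-tuned models — one might excel at faces, another at landscapes. It only works when every checkpoint shares the same base architecture and latent space (for example, all fine-tuned from the same Stable Diffusion version); otherwise their noise predictions are not comparable and cannot be averaged meaningfully.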
Production pipeline architecture
Request-based pipeline with caching
from dataclasses import dataclass, field
from typing import Optional
import hashlib
import json
@dataclass
class GenerationRequest:
    """Parameters describing one image generation request."""

    prompt: str
    negative_prompt: str = ""
    width: int = 512
    height: int = 512
    steps: int = 30
    guidance_scale: float = 7.5
    # A negative seed means "no fixed seed"; PipelineOrchestrator neither seeds
    # nor caches such requests.
    seed: int = -1
    model: str = "sd-1.5"
    lora: Optional[str] = None
    controlnet: Optional[str] = None
    control_image: Optional[str] = None

    @property
    def cache_key(self) -> str:
        """Return a 16-hex-character digest identifying this request's output.

        Every field is part of the key, including ``controlnet`` and
        ``control_image``, so two requests that differ in any of them never
        share a cache entry.
        """
        data = {
            "prompt": self.prompt,
            "negative_prompt": self.negative_prompt,
            "width": self.width,
            "height": self.height,
            "steps": self.steps,
            "guidance_scale": self.guidance_scale,
            "seed": self.seed,
            "model": self.model,
            "lora": self.lora,
            "controlnet": self.controlnet,
            "control_image": self.control_image,
        }
        # sort_keys keeps the serialization, and therefore the digest, stable.
        return hashlib.sha256(
            json.dumps(data, sort_keys=True).encode()
        ).hexdigest()[:16]
class PipelineOrchestrator:
    """Serve generation requests from lazily loaded pipelines with a bounded cache.

    Loaded pipelines are kept in ``models`` keyed by model name. Results of
    seeded requests are kept in ``cache``, an insertion-ordered dict used as an
    LRU: the first key is the least recently used entry.
    """

    def __init__(self, max_cache_size: int = 128, device: str = "cuda"):
        """Create an empty orchestrator.

        Args:
            max_cache_size: maximum number of cached results; ``0`` or less
                disables storing new results.
            device: device for the seeded ``torch.Generator``.
        """
        self.models = {}
        self.cache = {}  # LRU cache of recent generations
        self.max_cache_size = max_cache_size
        self.device = device
        # LoRA currently applied to each loaded pipeline (None when none is).
        self._active_lora = {}

    def get_pipeline(self, request: "GenerationRequest"):
        """Return the pipeline for ``request.model`` with ``request.lora`` applied.

        The pipeline is loaded through ``self._load_model`` on first use.
        Pipelines are shared between requests, so the previously applied LoRA
        is unloaded whenever the requested one differs; LoRA weights are not
        reloaded when the same one is already active.
        """
        key = request.model
        if key not in self.models:
            self.models[key] = self._load_model(request.model)
        pipe = self.models[key]

        wanted = request.lora or None
        active = self._active_lora.get(key)
        if wanted != active:
            if active is not None:
                pipe.unload_lora_weights()
            if wanted is not None:
                pipe.load_lora_weights(wanted)
            self._active_lora[key] = wanted
        return pipe

    def generate(self, request: "GenerationRequest"):
        """Return the first image for ``request``, using the cache when possible.

        Only requests with ``seed >= 0`` are seeded and cached. The request's
        ``controlnet`` and ``control_image`` fields are not used here.
        """
        cacheable = request.seed >= 0
        key = request.cache_key if cacheable else None

        # Check cache
        if cacheable and key in self.cache:
            # Re-insert so this entry becomes the most recently used one.
            cached = self.cache.pop(key)
            self.cache[key] = cached
            return cached

        pipe = self.get_pipeline(request)
        generator = None
        if cacheable:
            generator = torch.Generator(self.device).manual_seed(request.seed)

        result = pipe(
            request.prompt,
            negative_prompt=request.negative_prompt,
            width=request.width,
            height=request.height,
            num_inference_steps=request.steps,
            guidance_scale=request.guidance_scale,
            generator=generator,
        ).images[0]

        if cacheable and self.max_cache_size > 0:
            self.cache[key] = result
            # Evict from the front (least recently used) to stay within bounds.
            while len(self.cache) > self.max_cache_size:
                del self.cache[next(iter(self.cache))]
        return result
Pipeline with quality gates
Add automated quality checks between stages:
class QualityGatedPipeline:
    """Wrap a pipeline and retry generation until the image passes quality gates.

    The gate checks ``_is_nsfw(image)`` and ``_aesthetic_score(image)`` are
    called here but not defined in this class, so they must be supplied
    elsewhere (for example by a subclass).
    """

    def __init__(self, pipe, nsfw_threshold=0.8, aesthetic_threshold=5.0):
        """Store the wrapped pipeline and the two gate thresholds."""
        self.pipe = pipe
        self.nsfw_threshold = nsfw_threshold
        self.aesthetic_threshold = aesthetic_threshold

    def generate_with_gates(self, prompt, max_attempts=3, **kwargs):
        """Generate up to ``max_attempts`` images and return the first that passes.

        Args:
            prompt: text prompt forwarded to the wrapped pipeline.
            max_attempts: maximum number of generations to try.
            **kwargs: forwarded to the wrapped pipeline; ``generator`` is
                replaced after an aesthetic failure.

        Returns:
            ``{"image", "score", "attempts"}`` on success, otherwise
            ``{"image": None, "error": "Failed quality gates"}``.
        """
        # Seed retries on the pipeline's own device so non-CUDA pipelines work;
        # fall back to "cuda" when the pipeline does not expose one.
        device = getattr(self.pipe, "device", "cuda")

        for attempt in range(max_attempts):
            image = self.pipe(prompt, **kwargs).images[0]

            # Gate 1: NSFW check
            if self._is_nsfw(image):
                continue

            # Gate 2: Aesthetic quality score
            score = self._aesthetic_score(image)
            if score < self.aesthetic_threshold:
                # Retry with different seed
                kwargs["generator"] = torch.Generator(device).manual_seed(
                    attempt * 1000
                )
                continue

            return {"image": image, "score": score, "attempts": attempt + 1}

        return {"image": None, "error": "Failed quality gates"}
Profiling pipeline bottlenecks
import time
class ProfiledPipeline:
    """Wrap a pipeline and record wall-clock timings of the latest ``generate`` call."""

    def __init__(self, pipe):
        """Keep a reference to ``pipe`` and start with no recorded timings."""
        self.timings = {}
        self.pipe = pipe

    def generate(self, prompt, **kwargs):
        """Run the wrapped pipeline on ``prompt`` and return its result.

        Records three entries in ``self.timings``: ``tokenization`` (a
        standalone tokenizer call), ``total`` (the full pipeline call) and
        ``per_step`` (``total`` divided by ``num_inference_steps``, taken from
        ``kwargs`` with a fallback of 50).
        """
        clock = time.perf_counter

        # Stage 1: time a tokenizer call on its own; its output is not needed.
        started = clock()
        self.pipe.tokenizer(prompt, return_tensors="pt")
        self.timings["tokenization"] = clock() - started

        # Stage 2: time the complete pipeline invocation.
        started = clock()
        output = self.pipe(prompt, **kwargs)
        elapsed = clock() - started

        self.timings["total"] = elapsed
        step_count = kwargs.get("num_inference_steps", 50)
        self.timings["per_step"] = elapsed / step_count
        return output

    def report(self):
        """Print one ``stage: seconds`` line per recorded timing."""
        for stage in self.timings:
            print(f"{stage}: {self.timings[stage]:.3f}s")
Typical bottleneck distribution: U-Net denoising takes 85–90% of total time, VAE decoding 5–10%, text encoding 1–2%.
One thing to remember: Production pipelines are built from composable components — custom denoising loops, latent-space manipulations, quality gates, and caching layers — and understanding the data flow between components lets you optimize the 90% bottleneck (U-Net denoising) without touching the rest.
See Also
- Diffusion Models Stable Diffusion and DALL-E don't 'draw' your images — they unspoil a scrambled mess until a picture emerges. Here's the surprisingly simple idea behind it.
- Python Controlnet Image Control Find out how ControlNet lets you boss around an AI artist by giving it sketches, poses, and outlines to follow.
- Python Gan Training Patterns Learn how two neural networks compete like an art forger and a detective to create incredibly realistic fake images.
- Python Image Inpainting Learn how Python can magically fill in missing parts of a photo, like erasing something and having the picture fix itself.
- Python Lora Fine Tuning Learn how LoRA lets you teach an AI new tricks without replacing its entire brain, using tiny add-on lessons instead.