For a while now, I’ve been fascinated by the idea of creating high-quality images purely from text. The growing capabilities of generative models like Stable Diffusion had me curious — how exactly does one go from a simple sentence to a detailed piece of art?
This weekend, I decided to roll up my sleeves and try it out.
🔧 The Setup: Building a Reproducible SDXL Pipeline
I started by creating a Python notebook on Google Colab, designed to be reproducible, efficient, and GPU-aware. Here's what the pipeline does:
- Mounts Google Drive to store models and outputs persistently
- Sets up local caching directories for huggingface_hub, diffusers, and PyTorch wheels
- Downloads required packages (like diffusers, transformers, accelerate) ahead of time into a local wheelhouse for fast, dependency-safe installs
- Detects GPU availability and automatically selects:
- stabilityai/stable-diffusion-xl-base-1.0 (SDXL) if CUDA is available
- runwayml/stable-diffusion-v1-5 as a fallback on CPU (work in progress)
- Initializes a DiffusionPipeline with memory-efficient options like vae_slicing and attention_slicing
- Generates an image using a carefully designed prompt and saves it to Drive
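Before diving into the full notebook (included at the end of this post), a tiny sanity-check cell like this tells you which branch the model selection will take on your runtime:

import torch

# Which model will the notebook pick on this runtime?
if torch.cuda.is_available():
    print("CUDA GPU:", torch.cuda.get_device_name(0), "-> SDXL")
else:
    print("No GPU detected -> SD v1.5 on CPU")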
🖼️ The Prompt
prompt = (
    "Ultra-detailed portrait of a mysterious traveler walking through "
    "a neon-lit cyberpunk city at night, reflective puddles, cinematic "
    "lighting, intricate textures, hyper-realistic, depth of field"
)
neg = "blurry, distorted, watermark, text, extra limbs"
💡 Why This Was Exciting
While tools like Midjourney and DALL·E abstract away the complexity, building your own Stable Diffusion pipeline gives you:
- Full control over versions, dependencies, and parameters
- A better understanding of how diffusion models work under the hood
- A great foundation for future projects like fine-tuning, LoRA, or prompt chaining
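On that last point, here is a minimal sketch of how LoRA weights could later be layered onto the pipeline this notebook builds. The repo id and weight filename below are placeholders I have not tested:

# Hypothetical: layer community LoRA weights onto the SDXL pipeline ("pipe")
# built in the notebook below; the repo id and filename are placeholder names.
pipe.load_lora_weights("some-user/sdxl-style-lora", weight_name="lora.safetensors")
styled = pipe(prompt, num_inference_steps=30).images[0]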
🧠 What I Learned
- SDXL is powerful — but managing model size and inference speed is key
- Prompt design is half the battle — you need to balance creativity with CLIP's 77-token limit (see the token-count sketch below)
- Using Hugging Face + Google Drive makes it easy to cache, persist, and share your experiments
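To check a prompt against that limit, you can run it through the same CLIP tokenizer the model uses. A minimal sketch, assuming the standard openai/clip-vit-large-patch14 tokenizer (the one SD v1.5 uses; SDXL adds a second text encoder with the same 77-token cap):

from transformers import CLIPTokenizer

# Count CLIP tokens in the prompt; 77 is the hard cap, including start/end tokens.
# "prompt" is the string from The Prompt section above.
tok = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
ids = tok(prompt)["input_ids"]
print(f"{len(ids)} / 77 tokens used")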
💡 The Output
(Image 1: SDXL render of the cyberpunk traveler prompt above.)
💡 Notebook Code
# ╔═══════════════════════════════════════════════════════════════╗
# ║ 0. MOUNT DRIVE + PREP CACHE FOLDERS ║
# ╚═══════════════════════════════════════════════════════════════╝
from google.colab import drive
drive.mount("/content/drive", force_remount=True)
import os, pathlib, subprocess, textwrap
DRIVE_ROOT = "/content/drive/MyDrive/Study/ai_models" # 👈 change if you like
HF_CACHE = f"{DRIVE_ROOT}/hf_cache"
WHEELHOUSE = f"{DRIVE_ROOT}/wheelhouse"
for p in (HF_CACHE, WHEELHOUSE):
    pathlib.Path(p).mkdir(parents=True, exist_ok=True)
# Redirect every 🤗 cache to Drive
os.environ["HF_HOME"] = HF_CACHE
os.environ["HUGGINGFACE_HUB_CACHE"] = HF_CACHE
os.environ["TRANSFORMERS_CACHE"] = HF_CACHE
os.environ["DIFFUSERS_CACHE"] = HF_CACHE
# ╔═══════════════════════════════════════════════════════════════╗
# ║ 1. DEFINE EXACT VERSIONS ║
# ╚═══════════════════════════════════════════════════════════════╝
TORCH_WHEEL = "torch==2.6.0+cu124" # matches Colab's CUDA 12.4 tool-chain
EXTRAS = [
    "torchvision==0.21.0+cu124",
    "torchaudio==2.6.0+cu124",
]
PKGS = [  # all installed with --no-deps later
    "diffusers==0.27.2",
    "transformers==4.34.0",
    "accelerate==1.9.0",
    "huggingface_hub==0.24.1",  # provides cached_download
    "safetensors==0.5.3",
    "invisible_watermark==0.2.0",
    "tokenizers==0.14.1",
]
# ╔═══════════════════════════════════════════════════════════════╗
# ║ 2. DOWNLOAD WHEELS ONCE (SKIPPED IF THEY EXIST) ║
# ╚═══════════════════════════════════════════════════════════════╝
def wheel_present(name_version: str) -> bool:
    name, ver = name_version.split("==")
    return bool(list(pathlib.Path(WHEELHOUSE).glob(f"{name}-{ver}*.whl")))
# Torch + friends come from the PyTorch wheel index
if not wheel_present("torch==2.6.0+cu124"):
    !pip download $TORCH_WHEEL {" ".join(EXTRAS)} -d "$WHEELHOUSE" \
        --index-url https://download.pytorch.org/whl/cu124
# Other packages from PyPI
for pkg in PKGS:
    if not wheel_present(pkg):
        !pip download $pkg -d "$WHEELHOUSE"
# ╔═══════════════════════════════════════════════════════════════╗
# ║ 3. INSTALL FROM LOCAL WHEELS ONLY ║
# ╚═══════════════════════════════════════════════════════════════╝
# 3-a Torch first (allows ABI-compatible extras)
!pip install --quiet --no-index --find-links="$WHEELHOUSE" \
    $TORCH_WHEEL {" ".join(EXTRAS)}
# 3-b Everything else, but **--no-deps** so nothing tries to upgrade torch
!pip install --quiet --no-index --find-links="$WHEELHOUSE" --no-deps \
    diffusers==0.27.2 transformers==4.34.0 accelerate==1.9.0 \
    huggingface_hub==0.24.1 tokenizers==0.14.1 \
    safetensors==0.5.3 invisible_watermark==0.2.0
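# Colab pre-installs peft, which can clash with these older pins, so remove it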
!pip uninstall -y peft
# ╔══════════════════════════════════════════════════════════════════╗
# ║ 4. LOAD A PIPELINE – SDXL on GPU, SD-v1.5 on CPU fallback ║
# ╚══════════════════════════════════════════════════════════════════╝
import importlib.metadata, torch
print("torch :", torch.__version__)
print("diffusers :", importlib.metadata.version("diffusers"))
print("hub :", importlib.metadata.version("huggingface_hub"))
# should show the pinned trio:
# torch 2.6.0+cu124 | diffusers 0.27.2 | hub 0.24.1
from diffusers import DiffusionPipeline
has_cuda = torch.cuda.is_available()
if has_cuda:
    MODEL_ID = "stabilityai/stable-diffusion-xl-base-1.0"
    DTYPE = torch.float16
    DEVICE = "cuda"
    LOAD_KW = {}  # GPU loads everything
else:
    MODEL_ID = "runwayml/stable-diffusion-v1-5"
    DTYPE = torch.float32
    DEVICE = "cpu"
    LOAD_KW = dict(device_map="balanced", max_memory={"cpu": "10GiB"})
pipe = DiffusionPipeline.from_pretrained(
    MODEL_ID,
    torch_dtype=DTYPE,
    use_safetensors=True,
    **LOAD_KW,
)
if not LOAD_KW:  # .to() conflicts with device_map, so only move the pipe when no map is set
    pipe = pipe.to(DEVICE)
# VRAM / RAM savers
if has_cuda:
    # pipe.enable_xformers_memory_efficient_attention()
    pipe.enable_vae_slicing()
else:
    pipe.enable_attention_slicing()
    pipe.enable_vae_slicing()
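# both slicing modes trade a little speed for a much smaller peak-memory footprint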
# ╔══════════════════════════════════════════════════════════════════╗
# ║ 5. GENERATE A TEST IMAGE ║
# ╚══════════════════════════════════════════════════════════════════╝
prompt = ("Ultra-detailed portrait of a mysterious traveler walking through "
"a neon-lit cyberpunk city at night, reflective puddles, cinematic "
"lighting, intricate textures, hyper-realistic, depth of field")
neg = "blurry, distorted, watermark, text, extra limbs"
height = width = 1024 if has_cuda else 512
steps = 30 if has_cuda else 20
image = pipe(
    prompt=prompt,
    negative_prompt=neg,
    height=height,
    width=width,
    num_inference_steps=steps,
    guidance_scale=7.5,
    generator=torch.Generator(DEVICE).manual_seed(42),
).images[0]
# ╔══════════════════════════════════════════════════════════════════╗
# ║ 6. SAVE TO DRIVE ║
# ╚══════════════════════════════════════════════════════════════════╝
out_dir = pathlib.Path(DRIVE_ROOT) / "outputs"
out_dir.mkdir(parents=True, exist_ok=True)
fname = out_dir / f"traveler_{'sdxl' if has_cuda else 'v15'}.png"
image.save(fname)
print(f"✅ Render complete → {fname}")
image # displays inline in Colab
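A natural next cell once the pipeline exists: sweep a few seeds to explore variations of the same prompt. A quick sketch reusing the objects defined above:

# Sketch: re-render the same prompt under a few different seeds
for seed in (7, 123, 2024):
    img = pipe(
        prompt=prompt,
        negative_prompt=neg,
        num_inference_steps=steps,
        generator=torch.Generator(DEVICE).manual_seed(seed),
    ).images[0]
    img.save(out_dir / f"traveler_seed{seed}.png")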