Python dataclasses: Cleaner Code Than Dicts or NamedTuples
If you're passing dicts around your codebase and writing data["status"] everywhere, dataclasses are the upgrade you're looking for.
They give you: type hints, default values, auto-generated __init__, __repr__, and __eq__ — all in a few lines.
🎁 Free: AI Publishing Checklist — 7 steps in Python · Full pipeline: germy5.gumroad.com/l/xhxkzz (pay what you want, min $9.99)
The problem with dicts
# Dict — no type info, no IDE autocomplete; a misspelled key is only noticed at runtime, if at all
task = {
"id": "task-01",
"name": "Generate chapter",
"status": "pending",
"retries": 0,
}
# Nothing flags this typo before the line actually executes
print(task["statsu"]) # raises KeyError
print(task.get("statsu")) # None — silently wrong
# Functions become unclear
def process(task: dict) -> dict:  # what's in it? who knows
    """Mark ``task`` as done and return it.

    The dict is mutated in place, and the returned object is the very same
    dict that was passed in. The bare ``dict`` annotation tells the caller
    nothing about which keys are expected — the weakness being illustrated.
    """
    task["status"] = "done"
    return task
Dataclasses solve this:
from dataclasses import dataclass
@dataclass
class Task:
    """A unit of work with a fixed, typed set of fields.

    ``id`` and ``name`` are required; ``status`` and ``retries`` fall back to
    their defaults when omitted.
    """

    id: str
    name: str
    status: str = "pending"
    retries: int = 0
# Only the required fields are passed; status and retries take their defaults
task = Task(id="task-01", name="Generate chapter")
print(task.status) # "pending" — IDE can autocomplete this
print(task.statsu) # AttributeError — the typo fails loudly instead of silently returning None
The basics
from dataclasses import dataclass, field
from typing import Optional
@dataclass
class PipelineTask:
    """Represents one task in the automation pipeline."""

    id: str
    name: str
    code: str
    status: str = "pending"
    retries: int = 0
    error: Optional[str] = None
    # default_factory builds a fresh list for every instance, so tags are
    # never shared between tasks.
    tags: list[str] = field(default_factory=list)


# Create
task = PipelineTask(id="t01", name="Print hello", code="print('hi')")
print(task)
# PipelineTask(id='t01', name='Print hello', code="print('hi')", status='pending', retries=0, error=None, tags=[])

# Modify
task.status = "done"
task.tags.append("validated")

# Equality works automatically
t1 = PipelineTask(id="t01", name="x", code="y")
t2 = PipelineTask(id="t01", name="x", code="y")
print(t1 == t2)  # True — compares field values, not identity
Important: Don't use mutable defaults directly. Use field(default_factory=list) for lists, dicts, and sets:
# Wrong — a bare mutable default would be shared by every instance, so
# @dataclass refuses to build the class at all
@dataclass
class Bad:
    items: list = []  # ValueError raised as soon as the class is defined
# Right
@dataclass
class Good:
    """Container whose mutable fields are created fresh for each instance."""

    items: list[str] = field(default_factory=list)
    meta: dict = field(default_factory=dict)
field() — control each field
from dataclasses import dataclass, field
from datetime import datetime
@dataclass
class Article:
    """An article whose word count is derived from its body.

    ``word_count`` cannot be passed to the constructor; it is recomputed from
    ``body`` in ``__post_init__`` every time an instance is created.
    """

    title: str
    body: str
    # Hidden from __repr__ (don't leak tokens in logs)
    api_token: str = field(default="", repr=False)
    # Excluded from __init__ — computed internally
    word_count: int = field(default=0, init=False)
    # Excluded from equality comparison
    created_at: datetime = field(default_factory=datetime.now, compare=False)

    def __post_init__(self):
        # Runs after __init__ — use for derived values
        self.word_count = len(self.body.split())


a = Article(title="My Ebook", body="Hello world this is a test")
print(a.word_count)  # 6
print(repr(a))  # api_token not shown
__post_init__ — validation and computed fields
from dataclasses import dataclass, field
@dataclass
class ValidationResult:
    """Outcome of one validation run.

    ``has_output`` is derived from ``stdout`` rather than passed in. An
    instance is truthy exactly when ``passed`` is true.

    Raises:
        ValueError: if ``exit_code`` is below -1.
    """

    passed: bool
    exit_code: int
    stdout: str
    stderr: str
    timed_out: bool = False
    # Computed field
    has_output: bool = field(init=False)

    def __post_init__(self):
        # Whitespace-only stdout does not count as output
        self.has_output = bool(self.stdout.strip())
        # Validation
        if self.exit_code < -1:
            raise ValueError(f"Invalid exit code: {self.exit_code}")

    def __bool__(self):
        return self.passed


result = ValidationResult(passed=True, exit_code=0, stdout="hello\n", stderr="")
print(bool(result))  # True
print(result.has_output)  # True
Frozen dataclasses — immutable objects
from dataclasses import dataclass
@dataclass(frozen=True)
class Config:
    """Immutable config — safe to use as dict key or in sets."""

    model: str
    max_tokens: int
    temperature: float = 0.7


cfg = Config(model="claude-3-5-sonnet", max_tokens=4096)
# cfg.model = "gpt-4"  # FrozenInstanceError — can't modify

# Can be used as dict key or in a set
cache = {cfg: "result"}
configs = {cfg}
Dataclass inheritance
from dataclasses import dataclass
from typing import Optional
@dataclass
class BaseTask:
    """Fields shared by every task type."""

    id: str
    name: str
    status: str = "pending"


@dataclass
class CodeTask(BaseTask):
    """Task that runs Python code."""

    # Subclass fields need defaults because the inherited ``status`` has one:
    # a field without a default may not follow a field with one.
    code: str = ""
    timeout: int = 30


@dataclass
class PublishTask(BaseTask):
    """Task that publishes an article."""

    filename: str = ""
    cover_url: Optional[str] = None


code_task = CodeTask(id="c01", name="Run script", code="print('hi')")
pub_task = PublishTask(id="p01", name="Publish article", filename="post.md")
print(code_task.status)  # "pending" — inherited
print(isinstance(code_task, BaseTask))  # True
JSON: to and from dict
Dataclasses don't serialize to JSON automatically, but it's a two-liner:
from dataclasses import dataclass, asdict, astuple
import json
@dataclass
class Task:
    """Minimal task record used to demonstrate a JSON round trip."""

    id: str
    name: str
    status: str = "pending"


task = Task(id="t01", name="Generate chapter")

# To dict
d = asdict(task)
print(d)  # {'id': 't01', 'name': 'Generate chapter', 'status': 'pending'}

# To JSON
text = json.dumps(asdict(task), indent=2)

# From dict — the JSON keys match the field names, so they unpack straight
# into the constructor
data = json.loads(text)
restored = Task(**data)
print(restored == task)  # True
For nested dataclasses, asdict() recursively converts them too:
@dataclass
class Chapter:
    """One chapter of an ebook."""

    id: str
    title: str
    word_count: int


@dataclass
class Ebook:
    """A titled collection of chapters."""

    title: str
    chapters: list[Chapter]


ebook = Ebook(title="My Book", chapters=[Chapter("ch01", "Intro", 1200)])
print(json.dumps(asdict(ebook), indent=2))
# Prints this data, spread over several lines because of indent=2:
# {"title": "My Book", "chapters": [{"id": "ch01", "title": "Intro", "word_count": 1200}]}
slots=True — faster, smaller objects
Python 3.10+ supports slots=True for a small performance win:
from dataclasses import dataclass
@dataclass(slots=True) # Python 3.10+
class TaskResult:
task_id: str
passed: bool
stdout: str
stderr: str
# Uses __slots__ instead of __dict__ — faster attribute access, less memory
# Trade-off: can't add arbitrary attributes at runtime
Use slots=True when you create many instances in a loop (like processing 10,000 tasks).
Dataclass vs dict vs NamedTuple: when to use each
| | dict | NamedTuple | dataclass |
|---|---|---|---|
| Mutable | ✅ | ❌ | ✅ (or ❌ with `frozen=True`) |
| Type hints | ❌ | ✅ | ✅ |
| Default values | ❌ | partial | ✅ |
| Inheritance | ❌ | ❌ | ✅ |
| `__post_init__` | ❌ | ❌ | ✅ |
| JSON (direct) | ✅ | with `._asdict()` | with `asdict()` |
| IDE autocomplete | ❌ | ✅ | ✅ |
| Dict key | ❌ | ✅ | only if `frozen=True` |
Use dict when: you don't know the keys ahead of time (API responses, config merging).
Use NamedTuple when: you need an immutable, hashable, index-accessible tuple-like structure.
Use dataclass when: you have a fixed schema, need validation, mutation, or inheritance.
Real-world example: pipeline state machine
from dataclasses import dataclass, field, asdict
from datetime import datetime
from typing import Optional
import json
@dataclass
class TaskState:
    """State of one task in the automation pipeline."""

    id: str
    name: str
    status: str = "pending"  # pending | running | done | failed
    retries: int = 0
    max_retries: int = 3
    error: Optional[str] = None
    # Timestamps are stored as ISO-8601 strings so asdict() output can be
    # passed to json.dump unchanged.
    started_at: Optional[str] = None
    completed_at: Optional[str] = None

    def start(self) -> None:
        """Mark the task as running and record the start time."""
        self.status = "running"
        self.started_at = datetime.now().isoformat()

    def complete(self) -> None:
        """Mark the task as done and record the completion time."""
        self.status = "done"
        self.completed_at = datetime.now().isoformat()

    def fail(self, error: str) -> None:
        """Record a failed attempt.

        Increments ``retries`` and stores ``error``. The task becomes
        ``"failed"`` once ``retries`` reaches ``max_retries``; otherwise it
        goes back to ``"pending"`` so it can be picked up again.
        """
        self.retries += 1
        self.error = error
        if self.retries >= self.max_retries:
            self.status = "failed"
        else:
            self.status = "pending"  # will retry

    @property
    def should_run(self) -> bool:
        """True while the task is waiting to be (re)started."""
        return self.status == "pending"

    @classmethod
    def from_dict(cls, data: dict) -> "TaskState":
        """Build an instance from ``data``, ignoring keys that are not fields."""
        return cls(**{k: v for k, v in data.items() if k in cls.__dataclass_fields__})
# Usage
tasks = [
    TaskState(id="t01", name="Validate syntax"),
    TaskState(id="t02", name="Run subprocess"),
    TaskState(id="t03", name="Publish article"),
]

# Save state
state_data = [asdict(t) for t in tasks]
with open("pipeline_state.json", "w", encoding="utf-8") as f:
    json.dump(state_data, f, indent=2)

# Load state
with open("pipeline_state.json", encoding="utf-8") as f:
    loaded = [TaskState.from_dict(d) for d in json.load(f)]

# Process
for task in loaded:
    if task.should_run:
        task.start()
        try:
            # ... do the work ...
            task.complete()
        except Exception as e:
            # Any failure is recorded on the task instead of aborting the
            # whole run; fail() decides whether the task may be retried.
            task.fail(str(e))
The full pipeline uses dataclasses for ValidationResult, ChapterState, and PublishRecord — cleaner than dicts and safer than NamedTuples: germy5.gumroad.com/l/xhxkzz — pay what you want, min $9.99.
Top comments (0)