DEV Community

Cover image for Python dataclasses: Cleaner Code Than Dicts or NamedTuples
German Yamil
German Yamil

Posted on

Python dataclasses: Cleaner Code Than Dicts or NamedTuples

Python dataclasses: Cleaner Code Than Dicts or NamedTuples

If you're passing dicts around your codebase and writing data["status"] everywhere, dataclasses are the upgrade you're looking for.

They give you: type hints, default values, auto-generated __init__, __repr__, and __eq__ — all in a few lines.


🎁 Free: AI Publishing Checklist — 7 steps in Python · Full pipeline: germy5.gumroad.com/l/xhxkzz (pay what you want, min $9.99)


The problem with dicts

# Dict — no type info, no IDE autocomplete, typos silently fail
task = {
    "id": "task-01",
    "name": "Generate chapter",
    "status": "pending",
    "retries": 0,
}

# This typo won't be caught until runtime
print(task["statsu"])  # KeyError
print(task.get("statsu"))  # None — silently wrong

# Functions become unclear
def process(task: dict) -> dict:  # the signature says nothing about which keys exist
    """Mark ``task`` as done in place and hand the very same dict back."""
    task.update(status="done")
    return task
Enter fullscreen mode Exit fullscreen mode

Dataclasses solve this:

from dataclasses import dataclass

@dataclass
class Task:
    """A task record: ``id`` and ``name`` are required, the rest have defaults."""
    id: str
    name: str
    status: str = "pending"  # used when the caller does not pass a status
    retries: int = 0  # used when the caller does not pass a retry count

task = Task(id="task-01", name="Generate chapter")
print(task.status)    # "pending" — IDE can autocomplete this
print(task.statsu)    # AttributeError — caught immediately
Enter fullscreen mode Exit fullscreen mode

The basics

from dataclasses import dataclass, field
from typing import Optional

@dataclass
class PipelineTask:
    """Represents one task in the automation pipeline.

    ``id``, ``name`` and ``code`` are required; every other field has a default.
    """
    id: str
    name: str
    code: str
    status: str = "pending"
    retries: int = 0
    error: Optional[str] = None
    # default_factory builds a new list for each instance instead of sharing one
    tags: list[str] = field(default_factory=list)

# Create
task = PipelineTask(id="t01", name="Print hello", code="print('hi')")
print(task)
# PipelineTask(id='t01', name='Print hello', code="print('hi')", status='pending', retries=0, error=None, tags=[])

# Modify
task.status = "done"
task.tags.append("validated")

# Equality works automatically
t1 = PipelineTask(id="t01", name="x", code="y")
t2 = PipelineTask(id="t01", name="x", code="y")
print(t1 == t2)  # True — compares field values, not identity
Enter fullscreen mode Exit fullscreen mode

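Important: Don't use mutable defaults directly. Use field(default_factory=...) with the matching constructor — default_factory=list, default_factory=dict, or default_factory=set — for lists, dicts, and sets: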

# Wrong — a bare mutable default would be shared by every instance,
# so @dataclass refuses it outright
@dataclass
class Bad:
    items: list = []  # ValueError raised as soon as the class is defined

# Right
@dataclass
class Good:
    """Mutable fields declared with factories, so each instance gets its own container."""
    items: list[str] = field(default_factory=list)  # fresh list per instance
    meta: dict = field(default_factory=dict)  # fresh dict per instance
Enter fullscreen mode Exit fullscreen mode

field() — control each field

from dataclasses import dataclass, field
from datetime import datetime

@dataclass
class Article:
    """An article whose ``word_count`` is derived from ``body`` at construction.

    Only ``title`` and ``body`` are required. ``api_token`` and ``created_at``
    are optional constructor arguments; ``word_count`` cannot be passed in.
    """

    title: str
    body: str

    # repr=False keeps the token out of the generated __repr__ (and so out of logs)
    api_token: str = field(default="", repr=False)

    # init=False removes this from the generated __init__; __post_init__ fills it in
    word_count: int = field(default=0, init=False)

    # compare=False leaves the timestamp out of the generated __eq__
    created_at: datetime = field(default_factory=datetime.now, compare=False)

    def __post_init__(self):
        """Compute ``word_count`` once the generated ``__init__`` has set the fields."""
        words = self.body.split()  # split on any run of whitespace
        self.word_count = len(words)

a = Article(title="My Ebook", body="Hello world this is a test")
print(a.word_count)   # 6
print(repr(a))        # api_token not shown
Enter fullscreen mode Exit fullscreen mode

__post_init__ — validation and computed fields

from dataclasses import dataclass, field

@dataclass
class ValidationResult:
    """Outcome of one validation run; truth-tests as its ``passed`` flag."""
    passed: bool
    exit_code: int
    stdout: str
    stderr: str
    timed_out: bool = False

    # Derived in __post_init__, so it is not a constructor argument
    has_output: bool = field(init=False)

    def __post_init__(self):
        """Derive ``has_output`` and reject exit codes below -1.

        Raises:
            ValueError: if ``exit_code`` is less than -1.
        """
        visible_output = self.stdout.strip()
        self.has_output = bool(visible_output)  # whitespace-only stdout counts as no output

        # Sanity-check the exit code after the derived field is set
        if self.exit_code < -1:
            message = f"Invalid exit code: {self.exit_code}"
            raise ValueError(message)

    def __bool__(self):
        """Let ``if result:`` read as "did validation pass?"."""
        return self.passed

result = ValidationResult(passed=True, exit_code=0, stdout="hello\n", stderr="")
print(bool(result))       # True
print(result.has_output)  # True
Enter fullscreen mode Exit fullscreen mode

Frozen dataclasses — immutable objects

from dataclasses import dataclass

@dataclass(frozen=True)
class Config:
    """Immutable config — safe to use as dict key or in sets.

    ``frozen=True`` makes assignment to a field after construction raise
    ``FrozenInstanceError``.
    """
    model: str
    max_tokens: int
    temperature: float = 0.7  # optional; 0.7 is used when omitted

cfg = Config(model="claude-3-5-sonnet", max_tokens=4096)
# cfg.model = "gpt-4"  # FrozenInstanceError — can't modify

# Can be used as dict key or in a set
cache = {cfg: "result"}
configs = {cfg}
Enter fullscreen mode Exit fullscreen mode

Dataclass inheritance

from dataclasses import dataclass
from typing import Optional

@dataclass
class BaseTask:
    """Fields common to every task type: ``id``, ``name`` and a ``status``."""
    id: str
    name: str
    status: str = "pending"  # subclasses inherit this default

@dataclass
class CodeTask(BaseTask):
    """Task that runs Python code."""
    # BaseTask.status already has a default, and a dataclass field without a
    # default may not follow one with a default — so every field here needs one.
    code: str = ""
    timeout: int = 30

@dataclass
class PublishTask(BaseTask):
    """Task that publishes an article."""
    # BaseTask.status already has a default, and a dataclass field without a
    # default may not follow one with a default — so every field here needs one.
    filename: str = ""
    cover_url: Optional[str] = None

code_task = CodeTask(id="c01", name="Run script", code="print('hi')")
pub_task  = PublishTask(id="p01", name="Publish article", filename="post.md")

print(code_task.status)    # "pending" — inherited
print(isinstance(code_task, BaseTask))  # True
Enter fullscreen mode Exit fullscreen mode

JSON: to and from dict

Dataclasses don't serialize to JSON automatically, but it's a two-liner:

from dataclasses import dataclass, asdict, astuple
import json

@dataclass
class Task:
    """A task record made only of strings, so ``asdict()`` output is JSON-ready."""
    id: str
    name: str
    status: str = "pending"  # used when the caller does not pass a status

task = Task(id="t01", name="Generate chapter")

# To dict
d = asdict(task)
print(d)  # {'id': 't01', 'name': 'Generate chapter', 'status': 'pending'}

# To JSON
text = json.dumps(asdict(task), indent=2)

# From dict
data = json.loads(text)
restored = Task(**data)
print(restored == task)  # True
Enter fullscreen mode Exit fullscreen mode

For nested dataclasses, asdict() recursively converts them too:

@dataclass
class Chapter:
    """One chapter of an ebook: an id, a title and its word count."""
    id: str
    title: str
    word_count: int

@dataclass
class Ebook:
    """A titled book holding a list of ``Chapter`` dataclass instances."""
    title: str
    chapters: list[Chapter]  # nested dataclasses; asdict() converts these too

ebook = Ebook(title="My Book", chapters=[Chapter("ch01", "Intro", 1200)])
print(json.dumps(asdict(ebook), indent=2))
# {"title": "My Book", "chapters": [{"id": "ch01", "title": "Intro", "word_count": 1200}]}
Enter fullscreen mode Exit fullscreen mode

slots=True — faster, smaller objects

Python 3.10+ supports slots=True for a small performance win:

from dataclasses import dataclass

@dataclass(slots=True)  # Python 3.10+
class TaskResult:
    """Result of one task; ``slots=True`` generates ``__slots__`` for the fields."""
    task_id: str
    passed: bool
    stdout: str
    stderr: str

# Uses __slots__ instead of __dict__ — faster attribute access, less memory
# Trade-off: can't add arbitrary attributes at runtime
Enter fullscreen mode Exit fullscreen mode

Use slots=True when you create many instances in a loop (like processing 10,000 tasks).

Dataclass vs dict vs NamedTuple: when to use each

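| Feature | dict | NamedTuple | dataclass |
| --- | --- | --- | --- |
| Mutable | ✅ | ❌ | ✅ (or ❌ with frozen=True) |
| Type hints | ❌ | ✅ | ✅ |
| Default values | ❌ | partial | ✅ |
| Inheritance | ❌ | ❌ | ✅ |
| __post_init__ | ❌ | ❌ | ✅ |
| JSON (direct) | ✅ | with ._asdict() | with asdict() |
| IDE autocomplete | ❌ | ✅ | ✅ |
| Dict key | ❌ | ✅ | only if frozen=True |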

Use dict when: you don't know the keys ahead of time (API responses, config merging).

Use NamedTuple when: you need an immutable, hashable, index-accessible tuple-like structure.

Use dataclass when: you have a fixed schema, need validation, mutation, or inheritance.

Real-world example: pipeline state machine

from dataclasses import dataclass, field, asdict
from datetime import datetime
from typing import Optional
import json

@dataclass
class TaskState:
    """State of one task in the automation pipeline.

    ``status`` is one of ``"pending"``, ``"running"``, ``"done"`` or
    ``"failed"``. Timestamps are kept as ISO-format strings, so every field
    is a plain JSON-serializable value.
    """
    id: str
    name: str
    status: str = "pending"       # pending | running | done | failed
    retries: int = 0
    max_retries: int = 3
    error: Optional[str] = None
    started_at: Optional[str] = None
    completed_at: Optional[str] = None

    def start(self) -> None:
        """Mark the task as running and record the current local time."""
        self.status = "running"
        self.started_at = datetime.now().isoformat()

    def complete(self) -> None:
        """Mark the task as done and record the current local time."""
        self.status = "done"
        self.completed_at = datetime.now().isoformat()

    def fail(self, error: str) -> None:
        """Record a failed attempt.

        Increments ``retries`` and stores ``error``. Once ``retries`` reaches
        ``max_retries`` the task becomes ``"failed"``; before that it goes
        back to ``"pending"`` so it is picked up again.
        """
        self.retries += 1
        self.error = error
        if self.retries >= self.max_retries:
            self.status = "failed"
        else:
            self.status = "pending"  # will retry

    @property
    def should_run(self) -> bool:
        """Whether the task is waiting to be run (status is ``"pending"``)."""
        return self.status == "pending"

    @classmethod
    def from_dict(cls, data: dict) -> "TaskState":
        """Build an instance from a dict, ignoring keys ``__init__`` does not accept.

        Args:
            data: Mapping of field names to values, e.g. one ``asdict()``
                result loaded back from JSON. Unknown keys are dropped.

        Returns:
            A new instance of ``cls``.

        Raises:
            TypeError: if a required field (``id`` or ``name``) is missing.
        """
        # Local import keeps this method self-contained; fields() is the public
        # way to inspect a dataclass.
        from dataclasses import fields

        # Only init=True fields are constructor parameters; forwarding an
        # init=False field (e.g. one added by a subclass) would raise TypeError.
        accepted = {f.name for f in fields(cls) if f.init}
        return cls(**{k: v for k, v in data.items() if k in accepted})


# Usage
tasks = [
    TaskState(id="t01", name="Validate syntax"),
    TaskState(id="t02", name="Run subprocess"),
    TaskState(id="t03", name="Publish article"),
]

# Save state — asdict() turns each TaskState into a plain dict that json can write
state_data = [asdict(t) for t in tasks]
with open("pipeline_state.json", "w") as f:
    json.dump(state_data, f, indent=2)

# Load state — from_dict() rebuilds a TaskState from each saved dict
with open("pipeline_state.json") as f:
    loaded = [TaskState.from_dict(d) for d in json.load(f)]

# Process — only tasks whose status is "pending" are run
for task in loaded:
    if task.should_run:
        task.start()
        try:
            # ... do the work ...
            task.complete()
        except Exception as e:
            # fail() stores the error, then re-queues the task or marks it failed
            task.fail(str(e))
Enter fullscreen mode Exit fullscreen mode

The full pipeline uses dataclasses for ValidationResult, ChapterState, and PublishRecord — cleaner than dicts and safer than NamedTuples: germy5.gumroad.com/l/xhxkzz — pay what you want, min $9.99.


Further Reading

Top comments (0)