I got tired of taking meeting notes. So I built a system that records meetings, transcribes them with OpenAI Whisper, and generates structured summaries with Claude — complete with action items, decisions, and follow-ups. Here is the complete technical walkthrough.
System Architecture
Audio Input -> Whisper Transcription -> Claude Summary Engine -> Output Router
├── Slack channel post
├── Notion page creation
├── Email to attendees
└── JSON for CRM
Step 1: Audio Capture and Transcription
First, we capture audio and run it through Whisper for transcription. I use the local Whisper model for privacy — no audio leaves your machine.
import whisper
import subprocess
import tempfile
from pathlib import Path
from dataclasses import dataclass
@dataclass
class TranscriptSegment:
    """One timestamped chunk of transcribed speech."""
    start: float  # segment start time, seconds from beginning of audio
    end: float    # segment end time, seconds
    text: str     # transcribed text (whitespace-stripped by the transcriber)
    speaker: str | None = None  # diarization label (e.g. "SPEAKER_00"); None until assigned
@dataclass
class Transcript:
    """A full transcription: ordered segments plus whole-file metadata."""
    segments: list[TranscriptSegment]  # in chronological order
    full_text: str        # complete transcript as a single string
    duration_seconds: float  # end time of the last segment (0 if empty)
    language: str         # language code reported by Whisper
class AudioTranscriber:
    """Wraps a local Whisper model to produce structured transcripts."""

    def __init__(self, model_size: str = "medium"):
        # The model is loaded once per instance; "medium" trades a little
        # accuracy for substantially faster English transcription.
        self.model = whisper.load_model(model_size)

    def transcribe_file(self, audio_path: str, language: str = "en") -> Transcript:
        """Transcribe an audio file and return structured segments.

        Args:
            audio_path: Path to any audio file Whisper can decode.
            language: Language hint passed to Whisper (default "en";
                previously hard-coded).

        Returns:
            A Transcript with per-segment timestamps; duration is taken from
            the last segment's end time (0 for an empty transcription).
        """
        result = self.model.transcribe(
            audio_path,
            language=language,
            verbose=False,
            word_timestamps=True,
            condition_on_previous_text=True,
        )
        segments = [
            TranscriptSegment(
                start=seg["start"],
                end=seg["end"],
                text=seg["text"].strip(),
            )
            for seg in result["segments"]
        ]
        return Transcript(
            segments=segments,
            full_text=result["text"].strip(),
            duration_seconds=segments[-1].end if segments else 0,
            language=result["language"],
        )

    def transcribe_from_system_audio(self, duration_seconds: int) -> Transcript:
        """Record from the default audio device via sox (macOS) and transcribe.

        Requires the `sox` binary on PATH. The temporary WAV file is removed
        even if recording or transcription raises.

        Raises:
            subprocess.CalledProcessError: if sox exits nonzero.
        """
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            tmp_path = f.name
        try:
            subprocess.run([
                "sox", "-d", "-r", "16000", "-c", "1",
                tmp_path, "trim", "0", str(duration_seconds),
            ], check=True)
            return self.transcribe_file(tmp_path)
        finally:
            # BUG FIX: the original only unlinked on success, leaking the
            # temp file whenever sox or Whisper raised.
            Path(tmp_path).unlink(missing_ok=True)
Step 2: Speaker Diarization
Raw Whisper output does not identify speakers. We add speaker diarization using pyannote.audio:
from pyannote.audio import Pipeline
import torch
class SpeakerDiarizer:
def __init__(self, hf_token: str):
self.pipeline = Pipeline.from_pretrained(
"pyannote/speaker-diarization-3.1",
use_auth_token=hf_token
)
if torch.cuda.is_available():
self.pipeline.to(torch.device("cuda"))
def assign_speakers(
self, audio_path: str, transcript: Transcript
) -> Transcript:
"""Run diarization and assign speakers to transcript segments."""
diarization = self.pipeline(audio_path)
for segment in transcript.segments:
mid_time = (segment.start + segment.end) / 2
speaker = self._get_speaker_at_time(diarization, mid_time)
segment.speaker = speaker
return transcript
def _get_speaker_at_time(self, diarization, time: float) -> str:
for turn, _, speaker in diarization.itertracks(yield_label=True):
if turn.start <= time <= turn.end:
return speaker
return "UNKNOWN"
Step 3: Claude Summary Engine
This is where the magic happens. We send the diarized transcript to Claude with a structured extraction prompt:
import anthropic
import json
from datetime import datetime
@dataclass
class MeetingSummary:
    """Structured meeting summary extracted by Claude, plus local metadata."""
    title: str                     # descriptive title inferred from content
    date: str                      # YYYY-MM-DD, date the summary was generated
    duration: str                  # human-readable, e.g. "42 minutes"
    attendees: list[str]           # speaker identifiers mentioned or detected
    executive_summary: str         # 2-3 sentence high-level summary
    key_decisions: list[dict]      # objects with "decision", "made_by", "context"
    action_items: list[dict]       # objects with "task", "owner", "deadline", "priority"
    discussion_topics: list[dict]  # objects with "topic", "summary", "outcome"
    follow_ups: list[dict]         # objects with "item", "owner", "due_date"
    raw_transcript: str            # timestamped, speaker-labeled transcript text
class SummaryEngine:
def __init__(self, api_key: str):
self.client = anthropic.Anthropic(api_key=api_key)
def generate_summary(self, transcript: Transcript) -> MeetingSummary:
formatted = self._format_transcript(transcript)
prompt = f"""Analyze this meeting transcript and extract a structured summary.
Return a JSON object with these fields:
- title: A descriptive meeting title (infer from content)
- attendees: List of speaker identifiers mentioned or detected
- executive_summary: 2-3 sentence high-level summary
- key_decisions: Array of objects with "decision", "made_by", "context"
- action_items: Array of objects with "task", "owner", "deadline" (infer if not stated), "priority" (high/medium/low)
- discussion_topics: Array of objects with "topic", "summary", "outcome"
- follow_ups: Array of objects with "item", "owner", "due_date"
- sentiment: Overall meeting tone (productive/contentious/neutral/brainstorming)
Transcript:
{formatted}
Return ONLY valid JSON."""
response = self.client.messages.create(
model="claude-sonnet-4-20250514",
max_tokens=4096,
messages=[{"role": "user", "content": prompt}]
)
data = json.loads(response.content[0].text)
duration_min = int(transcript.duration_seconds / 60)
return MeetingSummary(
title=data["title"],
date=datetime.now().strftime("%Y-%m-%d"),
duration=f"{duration_min} minutes",
attendees=data["attendees"],
executive_summary=data["executive_summary"],
key_decisions=data["key_decisions"],
action_items=data["action_items"],
discussion_topics=data["discussion_topics"],
follow_ups=data["follow_ups"],
raw_transcript=formatted
)
def _format_transcript(self, transcript: Transcript) -> str:
lines = []
for seg in transcript.segments:
timestamp = f"[{self._fmt_time(seg.start)}]"
speaker = seg.speaker or "Speaker"
lines.append(f"{timestamp} {speaker}: {seg.text}")
return "\n".join(lines)
def _fmt_time(self, seconds: float) -> str:
m, s = divmod(int(seconds), 60)
h, m = divmod(m, 60)
return f"{h:02d}:{m:02d}:{s:02d}"
Step 4: Output Distribution
Once we have the structured summary, we push it to multiple destinations:
import requests
import dataclasses
class OutputDistributor:
def __init__(self, config: dict):
self.slack_webhook = config.get("slack_webhook")
self.notion_token = config.get("notion_token")
self.notion_db = config.get("notion_database_id")
def to_slack(self, summary: MeetingSummary) -> None:
"""Post formatted summary to Slack channel."""
action_list = "\n".join(
f" - [ ] {a['task']} -> *{a['owner']}* (due: {a.get('deadline', 'TBD')})"
for a in summary.action_items
)
decision_list = "\n".join(
f" - {d['decision']}"
for d in summary.key_decisions
)
blocks = [
{"type": "header", "text": {"type": "plain_text", "text": summary.title}},
{"type": "section", "text": {"type": "mrkdwn", "text": summary.executive_summary}},
{"type": "section", "text": {"type": "mrkdwn", "text": f"*Decisions:*\n{decision_list}"}},
{"type": "section", "text": {"type": "mrkdwn", "text": f"*Action Items:*\n{action_list}"}},
]
requests.post(self.slack_webhook, json={"blocks": blocks})
def to_json(self, summary: MeetingSummary, output_path: str) -> None:
"""Export as JSON for CRM or other system integration."""
with open(output_path, "w") as f:
json.dump(dataclasses.asdict(summary), f, indent=2)
Putting It All Together
def process_meeting(audio_path: str, config: dict) -> MeetingSummary:
    """End-to-end pipeline: transcribe, diarize, summarize, distribute.

    config must provide "hf_token" and "anthropic_key", plus whatever
    destination keys OutputDistributor reads (e.g. "slack_webhook").
    Returns the generated MeetingSummary.
    """
    # Step 1: speech-to-text with a locally loaded Whisper model.
    transcript = AudioTranscriber(model_size="medium").transcribe_file(audio_path)
    print(f"Transcribed {transcript.duration_seconds:.0f}s of audio")

    # Step 2: attach speaker labels to each segment.
    transcript = SpeakerDiarizer(hf_token=config["hf_token"]).assign_speakers(
        audio_path, transcript
    )
    speakers = {s.speaker for s in transcript.segments if s.speaker}
    print(f"Identified {len(speakers)} speakers")

    # Step 3: structured extraction via Claude.
    summary = SummaryEngine(api_key=config["anthropic_key"]).generate_summary(
        transcript
    )
    print(f"Generated summary: {summary.title}")

    # Step 4: fan the summary out to its destinations.
    distributor = OutputDistributor(config)
    distributor.to_slack(summary)
    distributor.to_json(summary, f"meeting_{summary.date}.json")
    print("Distributed to Slack and JSON")
    return summary
Real-World Results
After deploying this for a 12-person engineering team:
| Metric | Before | After |
|---|---|---|
| Note-taking time | 15 min/meeting | 0 min |
| Action item capture rate | ~60% | 95%+ |
| "What did we decide?" Slack messages | 8/week | 0/week |
| Meeting summary delivery time | 2-24 hours | < 2 minutes |
The system pays for itself in the first week. The Whisper transcription runs locally (free), and Claude API costs are roughly $0.03-0.08 per meeting depending on length.
Production Tips
- Use Whisper "medium" not "large": 95% of the accuracy at 3x the speed for English-only meetings
- Chunk long meetings: For 2hr+ meetings, split audio into 30-minute chunks and process in parallel
- Speaker name mapping: Keep a config file mapping diarization labels (SPEAKER_00) to real names
- Prompt iteration: The summary prompt above is version 14 — expect to iterate based on your team's needs
The complete meeting intelligence system — including calendar integration, automatic recording triggers, and CRM sync — is part of my AI Automation Playbook for SMBs. It covers 12 production-ready automation systems with deployment scripts.
What meeting automation tools are you using? I would love to hear how others are solving the transcription-to-action-items pipeline.
Top comments (0)