On August 2, 2026, the EU AI Act becomes fully enforceable for high-risk AI systems.
If your team hasn't started, this post is the engineering implementation guide for the four technical requirements — audit logging, explainability, bias monitoring, and data lineage — all with code.
The regulation applies to you if your AI touches EU users and operates in healthcare, fintech, HR tech, legal tech, or education — regardless of where your company is incorporated.
Requirement 1: Comprehensive Audit Logging
Every AI decision in a high-risk system must be logged: inputs, outputs, context, model version, timestamp. No sampling, no exceptions. Queryable. 5-year minimum retention.
import hashlib, json, uuid
from datetime import datetime
from dataclasses import dataclass, field, asdict
from typing import Any, Optional
@dataclass
class AIDecisionRecord:
decision_id: str = field(default_factory=lambda: str(uuid.uuid4()))
timestamp: str = field(default_factory=lambda: datetime.utcnow().isoformat())
model_id: str = ""
model_version: str = ""
feature_name: str = "" # which product feature made this call
risk_category: str = "" # e.g. "high_risk_hr", "high_risk_fintech"
input_hash: str = "" # SHA-256 of raw input — audit trail without storing PII
input_type_summary: str = "" # e.g. "cv_document:3420_chars"
output_summary: str = "" # brief of what was decided/returned
confidence: Optional[float] = None
user_id_pseudonym: str = "" # pseudonymised — GDPR + AI Act compatible
explanation_logged: bool = False
class EUAIActAuditLogger:
"""
Logs AI decisions in EU AI Act-compliant format.
Back end: any append-only, tamper-evident store (CloudWatch, BigQuery, etc.)
"""
def __init__(self, storage):
self.storage = storage # inject your storage implementation
def log(
self,
model_id: str,
model_version: str,
feature_name: str,
risk_category: str,
raw_input: Any,
output_summary: str,
user_id: str,
confidence: Optional[float] = None,
explanation: Optional[str] = None,
) -> str:
record = AIDecisionRecord(
model_id=model_id,
model_version=model_version,
feature_name=feature_name,
risk_category=risk_category,
input_hash=hashlib.sha256(
json.dumps(raw_input, default=str).encode()
).hexdigest(),
input_type_summary=self._type_summary(raw_input),
output_summary=output_summary[:500], # cap length in log
confidence=confidence,
user_id_pseudonym=hashlib.sha256(
f"ailoitte_salt_{user_id}".encode()
).hexdigest()[:16],
explanation_logged=explanation is not None,
)
self.storage.append(asdict(record))
return record.decision_id
def _type_summary(self, raw_input: Any) -> str:
if isinstance(raw_input, str):
return f"text:{len(raw_input)}_chars"
if isinstance(raw_input, dict):
return f"structured:{list(raw_input.keys())[:5]}"
return f"type:{type(raw_input).__name__}"
def compliance_report(self, start: str, end: str) -> dict:
"""Pull audit log slice for regulatory inspection."""
records = self.storage.query_range(start, end)
return {
"period": {"from": start, "to": end},
"total_decisions": len(records),
"by_feature": self._group(records, "feature_name"),
"by_risk_category": self._group(records, "risk_category"),
"explanation_coverage_pct": round(
sum(1 for r in records if r["explanation_logged"]) / max(len(records), 1) * 100, 1
),
}
def _group(self, records, key):
from collections import Counter
return dict(Counter(r[key] for r in records))
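The logger takes storage as an injected dependency: anything with append(record) and query_range(start, end). Here is a minimal in-memory sketch, assuming nothing beyond the standard library, that illustrates the tamper-evident property via hash chaining. It is an illustration of the interface, not a production store; use a managed append-only service as noted in the docstring.

import hashlib, json

class HashChainedStore:
    """Minimal illustration of the storage interface EUAIActAuditLogger expects.
    Each record carries a hash of its predecessor, so any edit breaks the chain."""

    def __init__(self):
        self._records = []
        self._prev_hash = "genesis"

    def append(self, record: dict) -> None:
        # Chain: hash covers the record body plus the previous record's hash
        record["prev_hash"] = self._prev_hash
        body_hash = hashlib.sha256(
            json.dumps(record, sort_keys=True, default=str).encode()
        ).hexdigest()
        record["record_hash"] = body_hash
        self._prev_hash = body_hash
        self._records.append(record)

    def query_range(self, start: str, end: str) -> list[dict]:
        # ISO-8601 timestamps sort lexicographically, so string comparison works
        return [r for r in self._records if start <= r["timestamp"] <= end]

    def verify_chain(self) -> bool:
        # Recompute every hash; any tampered record or broken link returns False
        prev = "genesis"
        for r in self._records:
            body = {k: v for k, v in r.items() if k != "record_hash"}
            expected = hashlib.sha256(
                json.dumps(body, sort_keys=True, default=str).encode()
            ).hexdigest()
            if r["prev_hash"] != prev or r["record_hash"] != expected:
                return False
            prev = r["record_hash"]
        return True

Wire it up with EUAIActAuditLogger(storage=HashChainedStore()); verify_chain() is what an auditor, or your own scheduled integrity check, would run.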
Key point: Build this before shipping any production AI decision feature. Retrofitting it into an existing system typically costs 3–4× more engineering time.
Requirement 2: Explainability
You must be able to explain any high-risk AI output in human-readable terms on regulatory request. Two approaches depending on model type:
For structured ML (gradient boosting, tabular neural networks):
import shap, numpy as np
class StructuredMLExplainer:
def __init__(self, model, feature_names: list[str]):
self.model = model
self.features = feature_names
self.explainer = None
    def fit(self, background_data: np.ndarray):
        # Pass background data so TreeExplainer computes interventional SHAP values
        self.explainer = shap.TreeExplainer(self.model, data=background_data)
def explain(self, input_row: np.ndarray, top_n: int = 5) -> dict:
vals = self.explainer.shap_values(input_row)
if isinstance(vals, list):
vals = vals[1] # positive class for binary classifiers
flat = vals[0] if vals.ndim > 1 else vals
ranked = sorted(zip(self.features, flat), key=lambda x: abs(x[1]), reverse=True)[:top_n]
return {
"top_factors": [
{
"factor": name,
"direction": "increased risk" if v > 0 else "decreased risk",
"magnitude": "high" if abs(v) > 0.10 else "medium" if abs(v) > 0.04 else "low",
}
for name, v in ranked
],
"plain_english": self._plain(ranked),
"compliance_basis": "EU AI Act Art. 13 — generated via SHAP feature attribution",
}
def _plain(self, ranked):
pos = [n for n, v in ranked if v > 0][:2]
neg = [n for n, v in ranked if v < 0][:2]
parts = []
if pos: parts.append(f"Factors that raised this score: {', '.join(pos)}")
if neg: parts.append(f"Factors that lowered this score: {', '.join(neg)}")
return ". ".join(parts)
For LLM-based systems (RAG, generative AI):
SHAP-style feature attribution isn't practical for generative transformer models. The current pragmatic approach: log the context that produced the output, so you can say "this decision was based on these retrieved documents."
@dataclass
class LLMDecisionProvenance:
"""
Stores the 'why' for an LLM decision — the closest EU AI Act-compliant
explainability for generative systems currently available.
"""
decision_id: str
model_id: str
prompt_template_id: str # which prompt template was used
retrieved_sources: list[dict] # [{"doc_id": ..., "title": ..., "excerpt": ...}]
user_query_hash: str # hash of original user query
output_length: int
timestamp: str
def to_explanation(self) -> str:
sources = ", ".join(s["title"] for s in self.retrieved_sources[:3])
return (
f"This output was generated based on the following internal sources: {sources}. "
f"It used prompt template '{self.prompt_template_id}'. "
f"Full source text available in audit log reference {self.decision_id}."
)
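A sketch of how this slots into a RAG pipeline and feeds the Requirement 1 logger. The retriever, llm, prompt template ID, and document fields are placeholders for whatever your stack provides, not a prescribed API:

import hashlib, uuid
from datetime import datetime

def answer_with_provenance(query: str, user_id: str, retriever, llm, audit_logger):
    # retriever and llm are your own components; doc attributes are assumed
    docs = retriever.search(query, top_k=5)
    output = llm.generate(prompt_template_id="hr_screening_v2", query=query, context=docs)

    provenance = LLMDecisionProvenance(
        decision_id=str(uuid.uuid4()),
        model_id="llm_screener",
        prompt_template_id="hr_screening_v2",
        retrieved_sources=[
            {"doc_id": d.id, "title": d.title, "excerpt": d.text[:200]} for d in docs
        ],
        user_query_hash=hashlib.sha256(query.encode()).hexdigest(),
        output_length=len(output),
        timestamp=datetime.utcnow().isoformat(),
    )

    # Log decision and explanation together so explanation_logged is True
    audit_logger.log(
        model_id="llm_screener",
        model_version="2026-01",
        feature_name="cv_screening_chat",
        risk_category="high_risk_hr",
        raw_input=query,
        output_summary=output[:500],
        user_id=user_id,
        explanation=provenance.to_explanation(),
    )
    return output, provenance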
Requirement 3: Bias Monitoring Pipeline
Ongoing monitoring across protected characteristics — not just a pre-launch test.
from dataclasses import dataclass
from datetime import datetime
from collections import defaultdict
from typing import Optional
@dataclass
class BiasEvaluationReport:
model_id: str
model_version: str
eval_date: str
characteristic: str
group_positive_rates: dict
disparity_ratio: float # min_rate / max_rate (1.0 = perfect parity)
    threshold: float  # 0.80 = the four-fifths (80%) rule, used here as the pass threshold
compliant: bool
action_required: Optional[str]
class BiasMonitor:
"""Run after every model update and on a monthly schedule in production."""
EU_PROTECTED = ["gender", "age_group", "ethnicity", "disability_status", "religion"]
    DISPARITY_THRESHOLD = 0.80  # four-fifths (80%) rule: a common industry proxy; the AI Act sets no numeric threshold
def evaluate(
self,
model_id: str,
model_version: str,
predictions: list[int],
demographics: dict[str, list], # {"gender": [...], "age_group": [...]}
) -> list[BiasEvaluationReport]:
reports = []
for char in self.EU_PROTECTED:
if char not in demographics:
continue
groups = demographics[char]
rates = defaultdict(list)
for pred, group in zip(predictions, groups):
if group:
rates[group].append(pred)
group_positive_rates = {
g: round(sum(preds) / len(preds), 4)
for g, preds in rates.items()
if preds
}
if len(group_positive_rates) < 2:
continue
vals = list(group_positive_rates.values())
ratio = round(min(vals) / max(vals), 4) if max(vals) > 0 else 1.0
compliant = ratio >= self.DISPARITY_THRESHOLD
reports.append(BiasEvaluationReport(
model_id=model_id,
model_version=model_version,
eval_date=datetime.utcnow().isoformat(),
characteristic=char,
group_positive_rates=group_positive_rates,
disparity_ratio=ratio,
threshold=self.DISPARITY_THRESHOLD,
compliant=compliant,
action_required=None if compliant else "BIAS_REMEDIATION_REQUIRED — file incident report",
))
return reports
def regulatory_summary(self, reports: list[BiasEvaluationReport]) -> dict:
return {
"evaluation_date": datetime.utcnow().isoformat(),
"overall_status": "COMPLIANT" if all(r.compliant for r in reports) else "NON_COMPLIANT",
"characteristics_evaluated": len(reports),
"characteristics_passing": sum(1 for r in reports if r.compliant),
"details": [
{
"characteristic": r.characteristic,
"disparity_ratio": r.disparity_ratio,
"status": "PASS" if r.compliant else "FAIL",
"action": r.action_required,
}
for r in reports
],
"next_scheduled_evaluation": "30 days",
"standard_applied": "EU AI Act Annex IV + 80% disparate impact rule",
}
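A usage sketch with made-up data. This sample deliberately fails the 80% rule so the failure path is visible:

monitor = BiasMonitor()

# 1 = positive outcome (e.g. shortlisted), 0 = negative
predictions = [1, 0, 1, 1, 0, 1, 0, 0, 1, 0]
demographics = {
    "gender": ["male", "female", "male", "female", "male",
               "male", "female", "male", "female", "female"],
}

reports = monitor.evaluate("cv_screener", "3.2.1", predictions, demographics)
summary = monitor.regulatory_summary(reports)
print(summary["overall_status"])  # NON_COMPLIANT: 0.4 / 0.6 ≈ 0.67 < 0.80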
Requirement 4: Data Lineage Documentation
Maintain a data card for every training dataset. Minimum fields:
@dataclass
class DataCard:
dataset_name: str
dataset_version: str
source_description: str
collection_date_range: str
total_records: int
demographic_composition: dict # known breakdown by protected characteristics
known_biases: list[str] # documented gaps or known issues
quality_assessment: str
usage_restrictions: list[str]
last_updated: str
maintained_by: str
# Example — a completed card covering the EU AI Act Annex IV data documentation fields
cv_screening_training_data = DataCard(
dataset_name="cv_screening_v3",
dataset_version="3.2.1",
source_description="Historical hiring decisions from 2018–2023, 6 partner companies",
collection_date_range="2018-01-01 to 2023-12-31",
total_records=84_000,
demographic_composition={
"gender": {"male": 0.62, "female": 0.35, "other/unknown": 0.03},
"age_group": {"18-30": 0.28, "31-45": 0.41, "46+": 0.31},
"ethnicity": "not collected — gap documented",
},
known_biases=[
"Gender imbalance in technical roles (62% male in engineering data)",
"Ethnicity data not collected — bias in this dimension unmeasurable",
"Seniority overrepresentation from partner companies in financial services",
],
quality_assessment="Reviewed 2024-03. Manual spot-check of 500 records. Outlier rate: 2.1%.",
usage_restrictions=["HR screening models only", "Not for compensation decisions"],
last_updated="2024-03-15",
maintained_by="ML Platform Team",
)
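Data cards only help if they ship alongside the model artefacts. A minimal sketch that serialises a card to JSON for the documentation bundle; the directory layout is an assumption, not something the Act prescribes:

import json
from dataclasses import asdict
from pathlib import Path

def export_data_card(card: DataCard, out_dir: str = "compliance/data_cards") -> Path:
    # One JSON file per dataset version keeps the audit trail diffable
    path = Path(out_dir)
    path.mkdir(parents=True, exist_ok=True)
    out_file = path / f"{card.dataset_name}_{card.dataset_version}.json"
    out_file.write_text(json.dumps(asdict(card), indent=2))
    return out_file

export_data_card(cv_screening_training_data)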
Implementation Order
Week 1: Audit logging
Build the logging layer FIRST — everything else depends on it.
Design the schema. Hook into every AI decision path (see the decorator sketch after this plan). Set up storage.
Week 2: Explainability
Structured ML → SHAP integration + plain-English generator
LLM systems → provenance logging + source attribution
Week 3: Bias monitoring
Build evaluation pipeline. Define acceptable thresholds.
Set up monthly scheduled runs + alerting on threshold breach.
Week 4: Data lineage
Write data cards for all training datasets.
Set up update procedures for future dataset changes.
Weeks 5–6: Internal review + documentation
EU AI Act Article 11 technical documentation.
Conformity assessment (required for highest-risk systems).
Registration in EU database if required for your category.
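On the Week 1 item "hook into every AI decision path": a decorator is one low-friction way to guarantee nothing ships unlogged. Everything below (the names, the audit_logger instance, the scoring function) is illustrative, not a prescribed API:

import functools

def audited_decision(logger, model_id, model_version, feature_name, risk_category):
    """Wrap a function whose return value is an AI decision so every call
    is logged; new decision paths inherit compliance by decoration."""
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, user_id: str = "unknown", **kwargs):
            result = fn(*args, user_id=user_id, **kwargs)
            logger.log(
                model_id=model_id,
                model_version=model_version,
                feature_name=feature_name,
                risk_category=risk_category,
                raw_input={"args": args, "kwargs": kwargs},
                output_summary=str(result)[:500],
                user_id=user_id,
            )
            return result
        return wrapper
    return decorator

# Hypothetical usage, assuming audit_logger is the EUAIActAuditLogger from Requirement 1
@audited_decision(audit_logger, "credit_scorer", "1.4.0", "loan_approval", "high_risk_fintech")
def score_application(application: dict, user_id: str = "unknown") -> float:
    ...  # your model call goes here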
One question I'm genuinely still working through: for LLM-based high-risk systems, is provenance logging (the retrieved sources approach above) what regulators will actually accept as "explainability," or is there a more rigorous approach emerging? Curious what others building in regulated spaces have encountered.
Sunil — CEO, Ailoitte. We build EU AI Act compliance infrastructure for funded startups. 4-week delivery. ailoitte.com