As a best-selling author, I invite you to explore my books on Amazon. Don't forget to follow me on Medium and show your support. Thank you! Your support means the world!
Building robust data validation systems in Python requires careful consideration of multiple validation layers and techniques. I have spent considerable time working with various Python validation libraries and frameworks, and I have discovered that combining different approaches creates the most effective validation systems.
Pydantic Models for Type-Safe Validation
Pydantic represents one of the most powerful validation libraries available in Python today. The library uses Python type hints to create validators that automatically catch errors and provide detailed feedback about invalid data. I find Pydantic particularly effective because it validates data at the moment a model is constructed, at the boundary of the application, catching issues before they propagate through the rest of the code.
from pydantic import BaseModel, validator, Field, EmailStr
from typing import Optional, List, Union
from datetime import datetime, date
from decimal import Decimal
import re
class ContactInfo(BaseModel):
    # How to reach a customer. `phone` and `preferred_contact` are
    # constrained by regular expressions; `email` is checked by EmailStr.
    # (Comments rather than a class docstring, so the generated JSON schema
    # is left untouched.)
    phone: str = Field(..., regex=r'^\+?1?\d{9,15}$')
    email: EmailStr
    preferred_contact: str = Field(..., regex=r'^(email|phone|mail)$')

    @validator('phone')
    def validate_phone_format(cls, v):
        """Require at least 10 digits in the phone number.

        The value is returned unchanged; digits are only counted.

        Raises:
            ValueError: if fewer than 10 digits are present.
        """
        digit_count = len(re.findall(r'\d', v))
        if digit_count >= 10:
            return v
        raise ValueError('Phone number must contain at least 10 digits')
def _luhn_checksum(number):
    """Return the Luhn checksum of a string of ASCII digits (0 means valid)."""
    digits = [int(ch) for ch in number]
    # Counting from the rightmost digit, every second digit is doubled and
    # the decimal digits of the product are added to the running total.
    total = sum(digits[-1::-2])
    for digit in digits[-2::-2]:
        total += sum(divmod(digit * 2, 10))
    return total % 10


class BankAccount(BaseModel):
    # A single bank account. Field constraints cover length, format and a
    # non-negative two-decimal balance; the account number must additionally
    # pass a Luhn checksum.
    account_number: str = Field(..., min_length=8, max_length=17)
    routing_number: str = Field(..., regex=r'^\d{9}$')
    account_type: str = Field(..., regex=r'^(checking|savings)$')
    balance: Decimal = Field(..., ge=0, decimal_places=2)

    @validator('account_number')
    def validate_account_checksum(cls, v):
        """Validate the account number with the Luhn algorithm.

        Returns:
            The account number, unchanged.

        Raises:
            ValueError: if the value contains anything other than ASCII
                digits, or if its Luhn checksum is not zero.
        """
        # Reject non-digits explicitly; otherwise int() fails inside the
        # checksum with an unhelpful "invalid literal for int()" message.
        if not re.fullmatch(r'[0-9]+', v):
            raise ValueError('Account number must contain only digits')
        if _luhn_checksum(v) != 0:
            raise ValueError('Invalid account number checksum')
        return v
class Customer(BaseModel):
    # Customer record: identity, contact details, owned accounts and
    # financial attributes, each constrained by its Field definition.
    customer_id: str = Field(..., regex=r'^CUST\d{6}$')
    first_name: str = Field(..., min_length=1, max_length=50)
    last_name: str = Field(..., min_length=1, max_length=50)
    date_of_birth: date
    contact_info: ContactInfo
    accounts: List[BankAccount] = Field(default=[])
    credit_score: Optional[int] = Field(None, ge=300, le=850)
    annual_income: Optional[Decimal] = Field(None, ge=0)
    employment_status: str = Field(..., regex=r'^(employed|unemployed|retired|student)$')

    @validator('date_of_birth')
    def validate_age(cls, v):
        """Accept only birth dates that give an age from 18 to 120 years.

        Raises:
            ValueError: if the computed age is below 18 or above 120.
        """
        now = date.today()
        # One year is taken off when this year's birthday is still ahead.
        birthday_passed = (now.month, now.day) >= (v.month, v.day)
        years_old = now.year - v.year - (0 if birthday_passed else 1)
        if years_old < 18:
            raise ValueError('Customer must be at least 18 years old')
        if years_old > 120:
            raise ValueError('Invalid birth date')
        return v

    @validator('accounts')
    def validate_account_limits(cls, v, values):
        """Limit a customer to 5 accounts and $1,000,000 in total balance.

        Raises:
            ValueError: if either limit is exceeded.
        """
        if len(v) > 5:
            raise ValueError('Customer cannot have more than 5 accounts')
        combined = 0
        for held_account in v:
            combined += held_account.balance
        if combined > 1000000:
            raise ValueError('Total account balance cannot exceed $1,000,000')
        return v

    class Config:
        validate_assignment = True
        use_enum_values = True
# Usage with comprehensive error handling
def process_customer_data(customer_data):
    """Build a ``Customer`` from a mapping and report the outcome as a dict.

    Args:
        customer_data: Mapping of field names to raw values; it is expanded
            into keyword arguments for ``Customer``.

    Returns:
        ``{"success": True, "customer": <Customer>}`` when validation passes,
        otherwise ``{"success": False, "errors": [...]}`` where each entry
        has the keys ``"field"``, ``"message"`` and ``"value"``.
    """
    # Imported here under an alias: the surrounding imports never bring in
    # Pydantic's ValidationError, and other libraries used in this file
    # export a class with the same name.
    from pydantic import ValidationError as PydanticValidationError

    try:
        customer = Customer(**customer_data)
    except PydanticValidationError as e:
        error_details = [
            {
                # Nested locations are flattened, e.g. "contact_info -> phone".
                "field": " -> ".join(str(loc) for loc in error['loc']),
                "message": error['msg'],
                # Falls back to 'N/A' when the error dict has no 'input' key.
                "value": error.get('input', 'N/A'),
            }
            for error in e.errors()
        ]
        return {"success": False, "errors": error_details}
    return {"success": True, "customer": customer}
The validation system I created above demonstrates how Pydantic handles complex business rules while maintaining readability. The library automatically generates JSON schemas from these models, which proves invaluable when building API documentation or frontend validation.
Schema-Based JSON Validation
JSON Schema provides a standardized approach to validating data structures. I frequently use the jsonschema library when working with configuration files or API payloads that require strict validation against predefined schemas.
import jsonschema
from jsonschema import validate, ValidationError, Draft7Validator
import json
# Define comprehensive schema with custom formats
#
# JSON Schema (draft 7) for a user document. `user_id` and `profile` are
# required; unknown top-level and profile keys are rejected.
user_schema = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "properties": {
        "user_id": {
            "type": "string",
            "pattern": "^USR[0-9]{8}$"
        },
        "profile": {
            "type": "object",
            "properties": {
                "username": {
                    "type": "string",
                    "minLength": 3,
                    "maxLength": 20,
                    "pattern": "^[a-zA-Z0-9_]+$"
                },
                "email": {
                    "type": "string",
                    "format": "email"
                },
                "age": {
                    "type": "integer",
                    "minimum": 13,
                    "maximum": 120
                },
                "preferences": {
                    "type": "object",
                    "properties": {
                        "newsletter": {"type": "boolean"},
                        "language": {
                            "type": "string",
                            "enum": ["en", "es", "fr", "de", "it"]
                        },
                        "timezone": {
                            "type": "string",
                            # Area/Location names such as "America/New_York"
                            # or "America/Argentina/Buenos_Aires"; location
                            # parts may contain digits, "_", "+" and "-".
                            "pattern": "^[A-Za-z]+(?:/[A-Za-z0-9_+-]+)+$"
                        }
                    },
                    "required": ["newsletter", "language"]
                }
            },
            "required": ["username", "email", "age"],
            "additionalProperties": False
        },
        "permissions": {
            "type": "array",
            "items": {
                "type": "string",
                "enum": ["read", "write", "admin", "moderate"]
            },
            "uniqueItems": True,
            "maxItems": 10
        },
        "metadata": {
            "type": "object",
            "properties": {
                "created_at": {
                    "type": "string",
                    "format": "date-time"
                },
                "last_login": {
                    "type": "string",
                    "format": "date-time"
                },
                "login_count": {
                    "type": "integer",
                    "minimum": 0
                }
            }
        }
    },
    "required": ["user_id", "profile"],
    "additionalProperties": False,
    # Conditional rule: a user holding a privileged permission ("admin" or
    # "moderate") must be at least 18. "required" keeps the condition from
    # matching vacuously when "permissions" is absent.
    "if": {
        "properties": {
            "permissions": {
                "contains": {
                    "enum": ["admin", "moderate"]
                }
            }
        },
        "required": ["permissions"]
    },
    "then": {
        "properties": {
            "profile": {
                "properties": {
                    "age": {"minimum": 18}
                }
            }
        }
    }
}
class SchemaValidator:
    """Validate documents against one JSON Schema using the draft 7 rules.

    Args:
        schema: The JSON Schema as a dict.
        check_formats: When true (the default), ``"format"`` keywords such
            as ``"email"`` are enforced through a ``FormatChecker``. Pass
            ``False`` to treat formats as annotations only.
            NOTE(review): jsonschema only checks some formats when optional
            helper packages are installed - confirm for ``date-time``.
    """

    def __init__(self, schema, *, check_formats=True):
        self.schema = schema
        # Without a format checker, jsonschema never rejects a value because
        # of its "format" keyword.
        self.format_checker = jsonschema.FormatChecker() if check_formats else None
        # Names are qualified with the module because other libraries used
        # in this file export `validate` / `ValidationError` as well.
        self.validator = jsonschema.Draft7Validator(
            schema, format_checker=self.format_checker
        )

    def validate_data(self, data):
        """Return a list describing every validation error in ``data``.

        Each entry is a dict with ``"path"`` (location in the document, or
        ``"root"``), ``"message"``, ``"failed_value"`` and ``"schema_path"``.
        An empty list means the document is valid.
        """
        return [
            {
                "path": " -> ".join(str(item) for item in error.absolute_path) or "root",
                "message": error.message,
                "failed_value": error.instance,
                "schema_path": " -> ".join(str(item) for item in error.schema_path),
            }
            for error in self.validator.iter_errors(data)
        ]

    def is_valid(self, data):
        """Return ``True`` when ``data`` satisfies the schema."""
        return self.validator.is_valid(data)

    def validate_and_raise(self, data):
        """Return ``True`` for a valid document, otherwise raise.

        Raises:
            ValueError: if ``data`` does not satisfy the schema. The
                underlying jsonschema error is attached as ``__cause__``.
        """
        try:
            jsonschema.validate(
                instance=data,
                schema=self.schema,
                format_checker=self.format_checker,
            )
        except jsonschema.ValidationError as e:
            raise ValueError(f"Validation failed: {e.message}") from e
        return True
# Implementation with detailed error reporting
validator = SchemaValidator(user_schema)

# Example document checked against the schema.
sample_data = dict(
    user_id="USR12345678",
    profile=dict(
        username="john_doe123",
        email="john@example.com",
        age=25,
        preferences=dict(
            newsletter=True,
            language="en",
            timezone="America/New_York",
        ),
    ),
    permissions=["read", "write"],
    metadata=dict(
        created_at="2024-01-15T10:30:00Z",
        last_login="2024-01-20T14:22:00Z",
        login_count=42,
    ),
)

validation_errors = validator.validate_data(sample_data)
if not validation_errors:
    print("Data validation successful")
else:
    # One line per problem, prefixed with its location in the document.
    for error in validation_errors:
        print(f"Error in {error['path']}: {error['message']}")
This schema-based approach allows me to define complex validation rules including conditional validation, where certain fields become required based on the values of other fields. The error reporting provides precise information about validation failures, making debugging much easier.
Marshmallow for Flexible Serialization
Marshmallow excels at handling serialization, deserialization, and validation in a single framework. I particularly appreciate its ability to transform data during validation, which proves essential when working with APIs that receive data in various formats.
from marshmallow import Schema, fields, validate, validates, validates_schema
from marshmallow import ValidationError, post_load, pre_load
from datetime import datetime, date
import re
# Two-letter codes accepted for the `state` field (the 50 US states).
_US_STATE_CODES = frozenset({
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA',
    'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
    'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
    'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
    'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY',
})


class AddressSchema(Schema):
    """Postal address with a case-insensitive, upper-cased state code.

    NOTE(review): `country` accepts 'CA' and 'MX', but `state` is always
    checked against US state codes - confirm whether that is intended.
    """

    street = fields.Str(required=True, validate=validate.Length(min=5, max=100))
    city = fields.Str(required=True, validate=validate.Length(min=2, max=50))
    state = fields.Str(required=True, validate=validate.Length(equal=2))
    zip_code = fields.Str(required=True, validate=validate.Regexp(r'^\d{5}(-\d{4})?$'))
    country = fields.Str(missing='US', validate=validate.OneOf(['US', 'CA', 'MX']))

    @validates('state')
    def validate_state_code(self, value):
        """Reject state codes that are not in the accepted set.

        Raises:
            ValidationError: if the upper-cased value is not a known code.
        """
        if value.upper() not in _US_STATE_CODES:
            raise ValidationError('Invalid state code')

    @post_load
    def normalize_state(self, data, **kwargs):
        """Upper-case the loaded state code.

        This runs as a post-load hook because marshmallow discards the
        return value of ``@validates`` methods, so a validator cannot
        change the stored value.
        """
        if 'state' in data:
            data['state'] = data['state'].upper()
        return data
# Digit sequences rejected outright as social security numbers.
_REJECTED_SSNS = frozenset({'000000000', '123456789', '111111111'})


class PersonSchema(Schema):
    """Person record with cleaning before, and formatting after, validation.

    Before validation the phone number is reduced to digits and the names
    are stripped and title-cased. After a successful load the SSN is stored
    as ``NNN-NN-NNNN`` and the phone as ``(NNN) NNN-NNNN``.
    """

    first_name = fields.Str(required=True, validate=validate.Length(min=1, max=50))
    last_name = fields.Str(required=True, validate=validate.Length(min=1, max=50))
    email = fields.Email(required=True)
    phone = fields.Str(required=True)
    birth_date = fields.Date(required=True)
    social_security = fields.Str(required=True)
    address = fields.Nested(AddressSchema, required=True)
    # Self-referencing contact, one level deep only.
    emergency_contact = fields.Nested('self', exclude=['emergency_contact'])

    @pre_load
    def preprocess_data(self, data, **kwargs):
        """Return a cleaned copy of the raw input.

        The caller's mapping is never modified. Non-string phone or name
        values are left as they are so that field validation reports them,
        rather than the hook failing with a ``TypeError``.
        """
        if not hasattr(data, 'keys'):
            # Not a mapping: let marshmallow report the invalid input type.
            return data
        cleaned = dict(data)
        # Clean phone number
        phone = cleaned.get('phone')
        if isinstance(phone, str):
            cleaned['phone'] = re.sub(r'\D', '', phone)
        # Normalize names
        for name_field in ('first_name', 'last_name'):
            raw_name = cleaned.get(name_field)
            if isinstance(raw_name, str):
                cleaned[name_field] = raw_name.strip().title()
        return cleaned

    @validates('phone')
    def validate_phone(self, value):
        """Require exactly 10 digits (after pre-load cleaning)."""
        if len(value) != 10:
            raise ValidationError('Phone number must be exactly 10 digits')
        if not value.isdigit():
            raise ValidationError('Phone number must contain only digits')

    @validates('social_security')
    def validate_ssn(self, value):
        """Require 9 digits that are not one of the rejected sequences."""
        # Remove any non-digit characters
        ssn_digits = re.sub(r'\D', '', value)
        if len(ssn_digits) != 9:
            raise ValidationError('SSN must be exactly 9 digits')
        # Check for invalid patterns
        if ssn_digits in _REJECTED_SSNS:
            raise ValidationError('Invalid SSN pattern')

    @validates('birth_date')
    def validate_age(self, value):
        """Require an age from 18 to 120 years as of today."""
        today = date.today()
        # Subtract one when this year's birthday has not happened yet.
        age = today.year - value.year - ((today.month, today.day) < (value.month, value.day))
        if age < 18:
            raise ValidationError('Person must be at least 18 years old')
        if age > 120:
            raise ValidationError('Invalid birth date')

    @validates_schema
    def validate_emergency_contact(self, data, **kwargs):
        """Reject an emergency contact with the same first and last name."""
        emergency = data.get('emergency_contact')
        if not emergency:
            return
        # Emergency contact cannot be the same person
        if (emergency.get('first_name') == data.get('first_name') and
                emergency.get('last_name') == data.get('last_name')):
            raise ValidationError('Emergency contact cannot be the same person')

    @post_load
    def create_person(self, data, **kwargs):
        """Format the SSN and phone number of the validated record."""
        # Format SSN for storage
        if 'social_security' in data:
            ssn_digits = re.sub(r'\D', '', data['social_security'])
            data['social_security'] = f"{ssn_digits[:3]}-{ssn_digits[3:5]}-{ssn_digits[5:]}"
        # Format phone for storage
        if 'phone' in data:
            phone = data['phone']
            data['phone'] = f"({phone[:3]}) {phone[3:6]}-{phone[6:]}"
        return data
# Usage with comprehensive error handling
def validate_person_data(person_data):
    """Load ``person_data`` through ``PersonSchema`` and report the outcome.

    Returns:
        ``{"success": True, "data": <loaded record>}`` on success, otherwise
        ``{"success": False, "errors": {...}}`` mapping each field to a list;
        a message value that is not already a list is wrapped in one.
    """
    schema = PersonSchema()
    try:
        loaded = schema.load(person_data)
    except ValidationError as err:
        errors_by_field = {
            field: messages if isinstance(messages, list) else [messages]
            for field, messages in err.messages.items()
        }
        return {"success": False, "errors": errors_by_field}
    return {"success": True, "data": loaded}
# Example with nested validation
# NOTE(review): the top-level SSN reduces to "123456789", which
# PersonSchema.validate_ssn lists as an invalid pattern, so this sample is
# expected to come back with success False - confirm that is intended.
sample_person = dict(
    first_name=" john ",
    last_name=" DOE ",
    email="john.doe@example.com",
    phone="(555) 123-4567",
    birth_date="1990-05-15",
    social_security="123-45-6789",
    address=dict(
        street="123 Main Street",
        city="Anytown",
        state="ca",
        zip_code="12345",
        country="US",
    ),
    emergency_contact=dict(
        first_name="Jane",
        last_name="Smith",
        email="jane.smith@example.com",
        phone="5551234568",
        birth_date="1985-03-20",
        social_security="987-65-4321",
        address=dict(
            street="456 Oak Avenue",
            city="Another City",
            state="NY",
            zip_code="54321",
        ),
    ),
)

result = validate_person_data(sample_person)
Marshmallow's preprocessing and postprocessing capabilities allow me to clean and format data automatically during validation. This eliminates the need for separate data cleaning steps and ensures consistent data formatting throughout the application.
Cerberus for Dynamic Validation
Cerberus provides a lightweight alternative for validation scenarios that require dynamic rule generation. I find it particularly useful when validation rules need to change based on runtime conditions or user permissions.
from cerberus import Validator, TypeDefinition
from datetime import datetime, date
import re
def _passes_luhn(number):
    """Return True when a non-empty string of digits has a valid Luhn checksum."""
    digits = [int(ch) for ch in number]
    # From the right: odd positions are summed as-is, even positions are
    # doubled and the decimal digits of the product are summed.
    total = sum(digits[-1::-2])
    total += sum(sum(divmod(digit * 2, 10)) for digit in digits[-2::-2])
    return total % 10 == 0


class CustomValidator(Validator):
    """Cerberus validator with ``is_adult``, ``unique_in_list``,
    ``credit_card`` and ``business_hours`` rules.

    Each rule docstring ends with the schema Cerberus uses to validate the
    rule's argument; that text must remain the last part of the docstring.
    """

    def _validate_is_adult(self, is_adult, field, value):
        """Validate that the person is an adult based on birth date.

        ``value`` may be a date or a ``YYYY-MM-DD`` string; a string in any
        other format is reported as an error instead of raising.

        The rule's arguments are validated against this schema:
        {'type': 'boolean'}
        """
        if not is_adult:
            return
        if isinstance(value, str):
            try:
                birth_date = datetime.strptime(value, '%Y-%m-%d').date()
            except ValueError:
                self._error(field, "Birth date must be in YYYY-MM-DD format")
                return
        else:
            birth_date = value
        if not isinstance(birth_date, date):
            self._error(field, "Birth date must be a date")
            return
        today = date.today()
        # Subtract one when this year's birthday has not happened yet.
        age = today.year - birth_date.year - ((today.month, today.day) < (birth_date.month, birth_date.day))
        if age < 18:
            self._error(field, f"Person must be at least 18 years old. Current age: {age}")

    def _validate_unique_in_list(self, unique_in_list, field, value):
        """Validate that items in a list are unique.

        The rule's arguments are validated against this schema:
        {'type': 'boolean'}
        """
        if not (unique_in_list and isinstance(value, list)):
            return
        try:
            has_duplicates = len(value) != len(set(value))
        except TypeError:
            # Unhashable items (e.g. dicts): compare by equality instead.
            has_duplicates = any(item in value[:i] for i, item in enumerate(value))
        if has_duplicates:
            self._error(field, "All items in the list must be unique")

    def _validate_credit_card(self, is_credit_card, field, value):
        """Validate credit card number using Luhn algorithm.

        Non-digit characters are ignored. A value with no digits at all is
        rejected, because an empty digit sequence would otherwise produce a
        checksum of zero and pass.

        The rule's arguments are validated against this schema:
        {'type': 'boolean'}
        """
        if not is_credit_card:
            return
        card_digits = re.sub(r'\D', '', str(value))
        if not card_digits or not _passes_luhn(card_digits):
            self._error(field, "Invalid credit card number")

    def _validate_business_hours(self, business_hours, field, value):
        """Validate time is within business hours (09:00 to 17:00 inclusive).

        ``value`` may be an ``HH:MM`` string or a time object; a string
        that is not a real time (e.g. "25:99") is reported as an error
        instead of raising.

        The rule's arguments are validated against this schema:
        {'type': 'boolean'}
        """
        if not business_hours:
            return
        if isinstance(value, str):
            try:
                time_obj = datetime.strptime(value, '%H:%M').time()
            except ValueError:
                self._error(field, "Time must be a valid HH:MM value")
                return
        else:
            time_obj = value
        start_time = datetime.strptime('09:00', '%H:%M').time()
        end_time = datetime.strptime('17:00', '%H:%M').time()
        if not (start_time <= time_obj <= end_time):
            self._error(field, "Time must be within business hours (9:00 AM - 5:00 PM)")
# Define comprehensive validation schemas
def _parse_birth_date(value):
    """Convert a ``YYYY-MM-DD`` string to a ``date``; pass other values through.

    Raises:
        ValueError: if a string does not match ``YYYY-MM-DD``.
    """
    if isinstance(value, str):
        return datetime.strptime(value, '%Y-%m-%d').date()
    return value


# Cerberus schema for a user document. Only `user_id` and `personal_info`
# are required; the other sections are optional.
user_schema = {
    'user_id': {
        'type': 'string',
        'required': True,
        'regex': r'^USER\d{6}$'
    },
    'personal_info': {
        'type': 'dict',
        'required': True,
        'schema': {
            'first_name': {
                'type': 'string',
                'required': True,
                'minlength': 1,
                'maxlength': 50,
                'regex': r'^[a-zA-Z\s]+$'
            },
            'last_name': {
                'type': 'string',
                'required': True,
                'minlength': 1,
                'maxlength': 50,
                'regex': r'^[a-zA-Z\s]+$'
            },
            'birth_date': {
                'type': 'date',
                'required': True,
                # ISO date strings are converted first; without this a
                # string such as '1985-06-15' fails the 'date' type check.
                'coerce': _parse_birth_date,
                'is_adult': True
            },
            'email': {
                'type': 'string',
                'required': True,
                'regex': r'^[^@]+@[^@]+\.[^@]+$'
            },
            'phone': {
                'type': 'string',
                'required': True,
                'regex': r'^\+?1?\d{10}$'
            }
        }
    },
    'financial_info': {
        'type': 'dict',
        'required': False,
        'schema': {
            'credit_cards': {
                'type': 'list',
                'maxlength': 5,
                'schema': {
                    'type': 'string',
                    'credit_card': True
                },
                'unique_in_list': True
            },
            'annual_income': {
                'type': 'number',
                'min': 0,
                'max': 10000000
            },
            'employment_status': {
                'type': 'string',
                'allowed': ['employed', 'unemployed', 'self_employed', 'retired', 'student']
            }
        }
    },
    'preferences': {
        'type': 'dict',
        'required': False,
        'schema': {
            'contact_time': {
                'type': 'string',
                'regex': r'^\d{2}:\d{2}$',
                'business_hours': True
            },
            'communication_methods': {
                'type': 'list',
                'allowed': ['email', 'phone', 'sms', 'mail'],
                'unique_in_list': True
            },
            'language': {
                'type': 'string',
                'allowed': ['en', 'es', 'fr', 'de', 'it', 'pt']
            }
        }
    },
    'metadata': {
        'type': 'dict',
        'required': False,
        'schema': {
            'created_at': {'type': 'datetime'},
            'last_updated': {'type': 'datetime'},
            'version': {'type': 'integer', 'min': 1}
        }
    }
}
# Dynamic schema generation based on user role
def generate_schema_for_role(role):
    """Return the base user schema extended with role-specific fields.

    'admin' adds ``admin_notes`` and ``permissions``; 'employee' adds the
    required ``employee_id`` and ``department``; any other role gets the
    base schema only. The result is a new top-level dict, so the additions
    do not appear in ``user_schema``.
    """
    if role == 'admin':
        role_fields = {
            'admin_notes': {
                'type': 'string',
                'maxlength': 1000
            },
            'permissions': {
                'type': 'list',
                'allowed': ['read', 'write', 'delete', 'admin'],
                'unique_in_list': True
            },
        }
    elif role == 'employee':
        role_fields = {
            'employee_id': {
                'type': 'string',
                'required': True,
                'regex': r'^EMP\d{5}$'
            },
            'department': {
                'type': 'string',
                'required': True,
                'allowed': ['sales', 'marketing', 'engineering', 'hr', 'finance']
            },
        }
    else:
        role_fields = {}
    return {**user_schema, **role_fields}
# Validation with dynamic rules
def validate_user_data(data, user_role='standard'):
    """Validate ``data`` against the schema generated for ``user_role``.

    Returns:
        ``{"success": True, "data": <normalized document>}`` when the
        document is valid, otherwise ``{"success": False, "errors": ...}``
        holding the validator's error report.
    """
    checker = CustomValidator(generate_schema_for_role(user_role))
    if not checker.validate(data):
        return {"success": False, "errors": checker.errors}
    return {"success": True, "data": checker.normalized(data)}
# Example usage with role-based validation
admin_data = dict(
    user_id='USER123456',
    personal_info=dict(
        first_name='John',
        last_name='Doe',
        birth_date='1985-06-15',
        email='john.doe@company.com',
        phone='+15551234567',
    ),
    financial_info=dict(
        credit_cards=['4532015112830366', '5555555555554444'],
        annual_income=75000,
        employment_status='employed',
    ),
    preferences=dict(
        contact_time='14:30',
        communication_methods=['email', 'phone'],
        language='en',
    ),
    admin_notes='High-value customer with excellent payment history',
    permissions=['read', 'write', 'admin'],
)

# Validate the document with the extra fields allowed for the admin role.
result = validate_user_data(admin_data, 'admin')
The dynamic nature of Cerberus allows me to create validation systems that adapt to different contexts. This proves particularly valuable in multi-tenant applications where validation rules vary between organizations or user types.
Database Constraint Validation
Database-level validation provides the final layer of data integrity protection. I implement database constraints using SQLAlchemy to ensure that even if application-level validation is bypassed or misses a case, the database still maintains consistency.
python
import re
from datetime import datetime

from sqlalchemy import create_engine, Column, Integer, String, DateTime, Numeric
# SQLAlchemy exports no top-level `Decimal`; the name is bound to its
# DECIMAL column type so that existing `Decimal(15, 2)` columns resolve.
from sqlalchemy import DECIMAL as Decimal
from sqlalchemy import ForeignKey, CheckConstraint, UniqueConstraint, Index
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship, validates, sessionmaker
from sqlalchemy.sql import func
# Declarative base class that the ORM model classes in this section inherit from.
Base = declarative_base()
class Customer(Base):
    """ORM model for the ``customers`` table.

    Integrity is enforced twice: ``@validates`` hooks check and normalise
    values when attributes are set, and CHECK/unique constraints guard the
    table itself.
    """

    __tablename__ = 'customers'

    id = Column(Integer, primary_key=True)
    customer_code = Column(String(20), unique=True, nullable=False)
    first_name = Column(String(50), nullable=False)
    last_name = Column(String(50), nullable=False)
    email = Column(String(255), unique=True, nullable=False)
    phone = Column(String(20), nullable=False)
    date_of_birth = Column(DateTime, nullable=False)
    created_at = Column(DateTime, default=func.now())
    updated_at = Column(DateTime, default=func.now(), onupdate=func.now())

    # Relationships
    accounts = relationship("Account", back_populates="customer", cascade="all, delete-orphan")
    transactions = relationship("Transaction", back_populates="customer")

    # Database constraints
    # String literals use single quotes: in standard SQL double quotes
    # denote identifiers. CURRENT_TIMESTAMP replaces the SQLite-only
    # datetime("now").
    # NOTE(review): some databases reject non-deterministic functions in
    # CHECK constraints - confirm against the target database.
    __table_args__ = (
        CheckConstraint('length(first_name) >= 1', name='check_first_name_length'),
        CheckConstraint('length(last_name) >= 1', name='check_last_name_length'),
        CheckConstraint("email LIKE '%@%'", name='check_email_format'),
        CheckConstraint('date_of_birth < CURRENT_TIMESTAMP', name='check_birth_date_past'),
        Index('idx_customer_name', 'last_name', 'first_name'),
        Index('idx_customer_email', 'email'),
    )

    @validates('email')
    def validate_email(self, key, email):
        """Check the email format and return it lower-cased.

        Raises:
            ValueError: if the value does not look like an email address.
        """
        pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
        # fullmatch: with re.match, "$" would also accept a trailing newline.
        if not re.fullmatch(pattern, email):
            raise ValueError('Invalid email format')
        return email.lower()

    @validates('customer_code')
    def validate_customer_code(self, key, code):
        """Require the ``CUST`` prefix followed by six digits.

        Raises:
            ValueError: if the code does not match that format.
        """
        if not re.match(r'^CUST\d{6}$', code):
            raise ValueError('Customer code must follow format CUST######')
        return code

    @validates('phone')
    def validate_phone(self, key, phone):
        """Require exactly 10 digits and return ``(NNN) NNN-NNNN``.

        Raises:
            ValueError: if the value does not contain exactly 10 digits.
        """
        # Remove all non-digit characters
        digits = re.sub(r'\D', '', phone)
        if len(digits) != 10:
            raise ValueError('Phone number must contain exactly 10 digits')
        return f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
class Account(Base):
    """ORM model for the ``accounts`` table.

    Each account belongs to one customer. The account number and balance
    are checked when set, and again by CHECK constraints on the table.
    """

    __tablename__ = 'accounts'

    id = Column(Integer, primary_key=True)
    account_number = Column(String(20), unique=True, nullable=False)
    account_type = Column(String(20), nullable=False)
    # Fixed-point amount: 15 digits in total, 2 after the decimal point.
    balance = Column(Numeric(15, 2), nullable=False, default=0)
    customer_id = Column(Integer, ForeignKey('customers.id'), nullable=False)
    opened_date = Column(DateTime, default=func.now())
    status = Column(String(20), default='active')

    # Relationships
    customer = relationship("Customer", back_populates="accounts")
    transactions = relationship("Transaction", back_populates="account")

    # Database constraints
    # String literals use single quotes: in standard SQL double quotes
    # denote identifiers, not strings.
    __table_args__ = (
        CheckConstraint('balance >= 0', name='check_positive_balance'),
        CheckConstraint("account_type IN ('checking', 'savings', 'credit')", name='check_account_type'),
        CheckConstraint("status IN ('active', 'closed', 'frozen')", name='check_account_status'),
        CheckConstraint('length(account_number) >= 10', name='check_account_number_length'),
        Index('idx_account_customer', 'customer_id'),
        Index('idx_account_number', 'account_number'),
    )

    @validates('account_number')
    def validate_account_number(self, key, number):
        """Require a string of 10 to 20 digits.

        Raises:
            ValueError: if the value does not match that format.
        """
        if not re.match(r'^\d{10,20}$', number):
            raise ValueError('Account number must be 10-20 digits')
        return number

    @validates('balance')
    def validate_balance(self, key, balance):
        """Reject negative balances.

        ``None`` is passed through so that the NOT NULL column constraint
        reports it, rather than the comparison raising ``TypeError``.

        Raises:
            ValueError: if the balance is below zero.
        """
        if balance is not None and balance < 0:
            raise ValueError('Account balance cannot be negative')
        return balance
class Transaction(Base):
__tablename__ = 'transactions'
id = Column(Integer, primary_key=True)
transaction_id = Column(String(50), unique=True, nullable=False)
account_id = Column(Integer, ForeignKey('accounts.id'), nullable=False)
customer_id = Column(Integer, ForeignKey('customers.id'), nullable=False)
transaction_type = Column(String(20), nullable=False)
amount = Column(Decimal(15, 2), nullable=False)
description = Column(String(255))
transaction_date = Column(DateTime, default=func.now())
status = Column(String(20), default='pending
---
## 101 Books
**101 Books** is an AI-driven publishing company co-founded by author **Aarav Joshi**. By leveraging advanced AI technology, we keep our publishing costs incredibly low—some books are priced as low as **$4**—making quality knowledge accessible to everyone.
Check out our book **[Golang Clean Code](https://www.amazon.com/dp/B0DQQF9K3Z)** available on Amazon.
Stay tuned for updates and exciting news. When shopping for books, search for **Aarav Joshi** to find more of our titles. Use the provided link to enjoy **special discounts**!
## Our Creations
Be sure to check out our creations:
**[Investor Central](https://www.investorcentral.co.uk/)** | **[Investor Central Spanish](https://spanish.investorcentral.co.uk/)** | **[Investor Central German](https://german.investorcentral.co.uk/)** | **[Smart Living](https://smartliving.investorcentral.co.uk/)** | **[Epochs & Echoes](https://epochsandechoes.com/)** | **[Puzzling Mysteries](https://www.puzzlingmysteries.com/)** | **[Hindutva](http://hindutva.epochsandechoes.com/)** | **[Elite Dev](https://elitedev.in/)** | **[JS Schools](https://jsschools.com/)**
---
### We are on Medium
**[Tech Koala Insights](https://techkoalainsights.com/)** | **[Epochs & Echoes World](https://world.epochsandechoes.com/)** | **[Investor Central Medium](https://medium.investorcentral.co.uk/)** | **[Puzzling Mysteries Medium](https://medium.com/puzzling-mysteries)** | **[Science & Epochs Medium](https://science.epochsandechoes.com/)** | **[Modern Hindutva](https://modernhindutva.substack.com/)**
Top comments (0)