You get a CSV export from a client. It's a mess:
- Mixed encodings (some fields are Latin-1, others UTF-8)
- Dates in 5 different formats
- "NULL", "N/A", "None", "-", and empty strings all meaning the same thing
- Trailing whitespace everywhere
- 200 duplicate rows
Your first instinct: pip install pandas. But do you really need a 50MB dependency for this?
Here's how to clean CSV data using only Python's standard library.
1. Detect Encoding Automatically
def detect_encoding(file_path: str) -> str:
    """Guess the text encoding of *file_path* from its first 4 KB.

    Checks for BOM markers first (unambiguous), then tries a list of
    common encodings from strictest to most lenient.

    Note: latin-1 maps every possible byte value, so it must come LAST —
    in the original ordering (latin-1 before cp1252/shift-jis) the later
    entries were unreachable dead code.

    Returns an encoding name usable with open(..., encoding=...).
    """
    with open(file_path, 'rb') as f:
        raw = f.read(4096)
    # BOM checks
    if raw.startswith(b'\xef\xbb\xbf'):
        return 'utf-8-sig'
    if raw.startswith(b'\xff\xfe'):
        return 'utf-16-le'
    if raw.startswith(b'\xfe\xff'):
        return 'utf-16-be'
    # Strict-to-lenient order; latin-1 last because it never fails.
    for enc in ['utf-8', 'ascii', 'cp1252', 'shift-jis', 'latin-1']:
        try:
            raw.decode(enc)
            return enc
        except (UnicodeDecodeError, LookupError):
            continue
    return 'utf-8'  # fallback (unreachable in practice: latin-1 always decodes)
No chardet needed for 95% of real-world files.
2. Normalize Null Values
The wild variety of "nothing" representations in CSVs is staggering:
# Every spelling of "no value" we have seen in client exports, compared
# case-insensitively after stripping whitespace.
NULL_VARIANTS = {
    '', 'null', 'none', 'n/a', 'na', 'nan', '#n/a',
    '-', '--', '.', 'missing', 'undefined', '#ref!', '#value!',
}

def normalize_null(value: str) -> str:
    """Map every null-ish spelling ('NULL', ' N/A ', '-', ...) to ''.

    Non-null values pass through untouched (including their whitespace).
    """
    key = value.strip().lower()
    return '' if key in NULL_VARIANTS else value
3. Auto-Detect Date Formats
Instead of trying every format on every value, detect the format once per column:
import re
from datetime import datetime
DATE_PATTERNS = [
(r'\d{4}-\d{2}-\d{2}', '%Y-%m-%d'),
(r'\d{2}/\d{2}/\d{4}', '%m/%d/%Y'),
(r'\d{2}-\d{2}-\d{4}', '%m-%d-%Y'),
(r'\d{2}\.\d{2}\.\d{4}', '%d.%m.%Y'),
(r'\d{4}/\d{2}/\d{2}', '%Y/%m/%d'),
]
def detect_date_format(values: list[str]) -> str | None:
"""Sample values to find the date format for a column."""
sample = [v for v in values if v.strip()][:20]
for pattern, fmt in DATE_PATTERNS:
matches = sum(1 for v in sample if re.fullmatch(pattern, v.strip()))
if matches > len(sample) * 0.5:
return fmt
return None
4. Remove Duplicates Without Loading Everything
For really large files, you can hash rows instead of comparing them all:
import hashlib
def deduplicate_rows(rows: list[list[str]]) -> list[list[str]]:
    """Return *rows* with exact duplicates removed, keeping first occurrence.

    Keys on a digest of repr(row) rather than ','.join(row): the join
    collides when a field itself contains a comma (['a,b'] vs ['a', 'b']
    produced the same key, silently dropping a distinct row), while repr
    escapes its delimiters and is unambiguous.  Storing 32-char digests
    instead of whole rows keeps the seen-set small for large files.
    """
    seen: set[str] = set()
    unique: list[list[str]] = []
    for row in rows:
        digest = hashlib.md5(repr(row).encode('utf-8')).hexdigest()
        if digest not in seen:
            seen.add(digest)
            unique.append(row)
    return unique
5. Standardize Column Names
import re
def standardize_name(name: str) -> str:
    """Convert any column header to clean snake_case.

    Lower-cases the name, then joins its alphanumeric runs with
    underscores — everything else (spaces, punctuation, symbols) acts
    as a separator.  Returns 'unnamed' when nothing alphanumeric
    survives.
    """
    tokens = re.findall(r'[a-z0-9]+', name.lower())
    return '_'.join(tokens) or 'unnamed'
"Customer First Name " → customer_first_name
"Revenue ($)" → revenue
6. Put It All Together
import csv
def clean_csv(input_path: str, output_path: str) -> None:
    """Clean a messy CSV and write a normalized UTF-8 copy.

    Pipeline: detect encoding -> snake_case headers -> strip/normalize
    cells -> drop exact duplicate rows -> rewrite date columns as ISO
    (YYYY-MM-DD).

    Raises ValueError if the input file is empty (no header row); the
    original raised a bare StopIteration from next(reader).
    """
    encoding = detect_encoding(input_path)
    # newline='' is required by the csv module so that quoted fields
    # containing embedded newlines round-trip correctly.
    with open(input_path, 'r', encoding=encoding, newline='') as f:
        reader = csv.reader(f)
        raw_headers = next(reader, None)
        if raw_headers is None:
            raise ValueError(f"{input_path}: file is empty, no header row")
        headers = [standardize_name(h) for h in raw_headers]
        rows = list(reader)

    # Per-cell cleanup: strip whitespace, then collapse null spellings.
    for row in rows:
        for i, cell in enumerate(row):
            row[i] = normalize_null(cell.strip())

    # Deduplicate AFTER cleanup, so 'x ' and 'x' count as the same row.
    original_count = len(rows)
    rows = deduplicate_rows(rows)
    print(f"Removed {original_count - len(rows)} duplicates")

    # Detect each column's dominant date format and rewrite to ISO.
    for col_idx in range(len(headers)):
        # Ragged rows may be shorter than the header; guard the index.
        col_values = [row[col_idx] for row in rows if col_idx < len(row)]
        date_fmt = detect_date_format(col_values)
        if not date_fmt:
            continue
        for row in rows:
            if col_idx < len(row) and row[col_idx]:
                try:
                    dt = datetime.strptime(row[col_idx], date_fmt)
                    row[col_idx] = dt.strftime('%Y-%m-%d')
                except ValueError:
                    pass  # mixed-format cell; leave it unchanged

    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(rows)
    print(f"Cleaned: {len(rows)} rows, {len(headers)} columns → {output_path}")
The Complete Tool
I packaged all of this (plus type detection, outlier flagging, and column profiling) into a single-file Python tool:
CSV Cleaner — one command, clean data.
python csv_cleaner.py messy_data.csv -o clean_data.csv
It's free, open source, and has zero external dependencies.
The Pro version ($19) adds smart fill, fuzzy dedup, custom validation rules, batch processing, and HTML reports.
Part of the Vesper Developer Toolkit — open source CLI tools that just work.
Top comments (0)