DEV Community

agenthustler
agenthustler

Posted on

How to Build a Personal Finance Aggregator with Screen Scraping in Python

Personal finance aggregators like Mint and Plaid connect to your bank accounts and show you everything in one place. While you cannot replicate Plaid's bank partnerships, you can build a personal aggregator using screen scraping and CSV imports for your own financial data.

The Architecture

Bank CSV Exports / Screen Scraping --> Normalizer --> SQLite DB --> Analytics
Enter fullscreen mode Exit fullscreen mode

Setting Up

pip install requests beautifulsoup4 pandas sqlite-utils playwright
playwright install chromium
Enter fullscreen mode Exit fullscreen mode

The Aggregator Framework

import hashlib
from datetime import datetime, timezone

import pandas as pd
import sqlite_utils

class FinanceAggregator:
    """SQLite-backed store for normalized transactions from many sources.

    Transactions are deduplicated with a content-derived primary key, so
    re-importing the same CSV is idempotent (rows are upserted, not doubled).
    """

    def __init__(self, db_path: str = "finances.db"):
        """Open (or create) the database at *db_path* and ensure the schema."""
        self.db = sqlite_utils.Database(db_path)
        self._init_tables()

    def _init_tables(self):
        """Create the transactions table on first run; no-op afterwards."""
        if "transactions" not in self.db.table_names():
            self.db["transactions"].create({
                "id": str, "date": str, "description": str,
                "amount": float, "category": str,
                "account": str, "source": str, "imported_at": str
            }, pk="id")

    def add_transactions(self, transactions: list[dict], source: str):
        """Tag, deduplicate, and upsert a batch of transaction dicts.

        Args:
            transactions: Dicts with at least "date", "description", "amount".
                Each dict is enriched in place with "source", "imported_at",
                and a stable "id".
            source: Label for where this batch came from (e.g. account name).
        """
        for tx in transactions:
            tx["source"] = source
            # Timezone-aware UTC; datetime.utcnow() is deprecated and returns
            # a naive timestamp that is ambiguous once stored.
            tx["imported_at"] = datetime.now(timezone.utc).isoformat()
            # Deterministic id from the transaction's identifying fields, so
            # the same row imported twice collapses into one record.
            tx["id"] = hashlib.sha256(
                f"{tx['date']}_{tx['description']}_{tx['amount']}_{source}".encode()
            ).hexdigest()[:16]

        self.db["transactions"].upsert_all(transactions, pk="id")
        print(f"Imported {len(transactions)} transactions from {source}")
Enter fullscreen mode Exit fullscreen mode

CSV Import for Bank Exports

Most banks let you export CSVs. Build importers for common formats:

def import_bank_csv(file_path: str, bank_format: str = "chase") -> list[dict]:
    df = pd.read_csv(file_path)

    format_mappings = {
        "chase": {"date_col": "Transaction Date", "desc_col": "Description",
                   "amount_col": "Amount", "category_col": "Category"},
        "bofa": {"date_col": "Date", "desc_col": "Payee",
                  "amount_col": "Amount", "category_col": None},
        "amex": {"date_col": "Date", "desc_col": "Description",
                  "amount_col": "Amount", "category_col": "Category"},
    }

    fmt = format_mappings.get(bank_format)
    if not fmt:
        raise ValueError(f"Unknown format: {bank_format}")

    transactions = []
    for _, row in df.iterrows():
        tx = {
            "date": pd.to_datetime(row[fmt["date_col"]]).strftime("%Y-%m-%d"),
            "description": str(row[fmt["desc_col"]]),
            "amount": float(row[fmt["amount_col"]]),
            "category": str(row.get(fmt["category_col"], "Uncategorized")) if fmt["category_col"] else "Uncategorized"
        }
        transactions.append(tx)
    return transactions
Enter fullscreen mode Exit fullscreen mode

Auto-Categorization

# Keyword → category lookup table. Rules are scanned top to bottom and the
# first hit wins, so more specific keywords must appear in earlier categories
# (e.g. "ubereats" under Restaurants is matched before "uber" under
# Transportation).
CATEGORY_RULES = {
    "Groceries": ["walmart", "trader joe", "whole foods", "kroger", "costco"],
    "Restaurants": ["doordash", "ubereats", "grubhub", "starbucks", "chipotle"],
    "Transportation": ["uber", "lyft", "shell", "chevron", "parking"],
    "Subscriptions": ["netflix", "spotify", "hulu", "adobe", "youtube"],
    "Utilities": ["electric", "water", "internet", "verizon", "att"],
}

def categorize_transaction(description: str) -> str:
    """Return the first rule category whose keyword appears in *description*.

    Matching is case-insensitive substring search; "Other" if nothing matches.
    """
    haystack = description.lower()
    for label, needles in CATEGORY_RULES.items():
        for needle in needles:
            if needle in haystack:
                return label
    return "Other"

def auto_categorize(aggregator: FinanceAggregator):
    """Assign a rule-based category to every "Uncategorized" transaction.

    Transactions whose description matches no rule ("Other") are left
    untouched so they can be retried once new rules are added.
    """
    table = aggregator.db["transactions"]
    pending = list(table.rows_where("category = ?", ["Uncategorized"]))
    hits = 0
    for record in pending:
        guess = categorize_transaction(record["description"])
        if guess == "Other":
            continue
        table.update(record["id"], {"category": guess})
        hits += 1
    print(f"Auto-categorized {hits}/{len(pending)} transactions")
Enter fullscreen mode Exit fullscreen mode

Spending Analytics

def monthly_report(aggregator: FinanceAggregator, year: int, month: int):
    """Print a spending breakdown (debits only) for one calendar month.

    Only rows with a negative amount are counted, so credits/refunds are
    excluded from the totals.
    """
    month_prefix = f"{year}-{month:02d}"
    # "-31" is a safe lexicographic upper bound: dates are stored as
    # YYYY-MM-DD strings, so e.g. "2026-02-28" <= "2026-02-31" holds even
    # though February 31 never occurs as data.
    rows = list(aggregator.db["transactions"].rows_where(
        "date >= ? AND date <= ? AND amount < 0",
        [f"{month_prefix}-01", f"{month_prefix}-31"],
    ))

    df = pd.DataFrame(rows)
    if df.empty:
        print("No transactions found")
        return

    # Work with positive magnitudes from here on.
    df["amount"] = df["amount"].abs()
    grouped = (
        df.groupby("category")["amount"]
        .agg(["sum", "count", "mean"])
        .sort_values("sum", ascending=False)
    )
    total = df["amount"].sum()

    print(f"Monthly Report: {month_prefix}")
    print(f"Total spending: ${total:,.2f}")
    print(f"Transactions: {len(df)}")
    for name, stats in grouped.iterrows():
        share = (stats["sum"] / total) * 100
        print(f"  {name}: ${stats['sum']:,.2f} ({share:.1f}%)")

# Usage: import a Chase checking export, auto-categorize it, and print the
# March 2026 spending report.
aggregator = FinanceAggregator()
chase_txs = import_bank_csv("chase_export.csv", "chase")
aggregator.add_transactions(chase_txs, "chase_checking")
auto_categorize(aggregator)
monthly_report(aggregator, 2026, 3)
Enter fullscreen mode Exit fullscreen mode

Screen Scraping for Portals Without Export

For banking portals that require JavaScript rendering, use Playwright to drive a real browser session. Be very cautious with third-party proxy and scraping services (ScraperAPI, ThorData residential proxies, ScrapeOps monitoring, and the like): routing an authenticated banking session through someone else's infrastructure exposes your credentials and session cookies. For financial accounts, scrape only from your own machine over a direct connection.

Security Notes

  • Store credentials in environment variables or a keychain, never in code
  • Use encrypted SQLite or SQLCipher for the database
  • Run on your local machine only, never on shared servers
  • This is for YOUR OWN accounts only

Building your own finance aggregator gives you complete control over your data and analytics. No subscriptions, no data sharing with third parties.

Top comments (0)