DEV Community

Propfirmkey
Propfirmkey

Posted on

Web Scraping Financial Data with Python: Best Practices

Accessing financial data programmatically is essential for any quant trader. Here's how to do it responsibly and efficiently.

Data Sources

Free APIs

import requests

# Yahoo Finance (unofficial)
def get_stock_data(symbol, period='1mo'):
    """Fetch daily chart data for *symbol* from Yahoo Finance.

    Parameters:
        symbol: ticker symbol, e.g. 'AAPL'
        period: Yahoo range string such as '1d', '1mo', '1y'

    Returns:
        The decoded JSON payload (dict) from the chart endpoint.

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    url = f'https://query1.finance.yahoo.com/v8/finance/chart/{symbol}'
    params = {'range': period, 'interval': '1d'}
    # Bug fix: `params` was built but never passed to requests.get, so the
    # requested range/interval were silently ignored by the API.
    r = requests.get(
        url,
        params=params,
        headers={'User-Agent': 'Mozilla/5.0'},  # Yahoo rejects default UA
        timeout=10,
    )
    r.raise_for_status()
    return r.json()

# CoinGecko (crypto)
def get_crypto_price(coin_id):
    """Return the current USD price of *coin_id* from CoinGecko.

    Parameters:
        coin_id: CoinGecko coin identifier, e.g. 'bitcoin'

    Returns:
        The decoded JSON payload, e.g. {'bitcoin': {'usd': 12345.0}}.

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    # Plain string: the original used an f-string with no placeholders.
    url = 'https://api.coingecko.com/api/v3/simple/price'
    params = {'ids': coin_id, 'vs_currencies': 'usd'}
    # Bug fix: `params` was built but never passed, so the API received no
    # query arguments and could not return the requested price.
    r = requests.get(url, params=params, timeout=10)
    r.raise_for_status()
    return r.json()
Enter fullscreen mode Exit fullscreen mode

Economic Calendar

from datetime import datetime

def get_forex_factory_events(date=None):
    """Parse economic-calendar events for *date* (stub, not yet implemented).

    Parameters:
        date: ISO 'YYYY-MM-DD' string; defaults to today when omitted.

    Be respectful of rate limits when filling this in.
    """
    # Default to today's date in ISO format.
    date = datetime.now().strftime('%Y-%m-%d') if date is None else date
    # Use the ForexFactory RSS or API
    # Always add delays between requests
    pass
Enter fullscreen mode Exit fullscreen mode

Best Practices

1. Respect Rate Limits

import time

class RateLimiter:
    """Throttle calls to at most *calls_per_second* per second.

    Call wait() immediately before each rate-limited operation; it sleeps
    just long enough to keep consecutive calls `delay` seconds apart.
    """

    def __init__(self, calls_per_second=1):
        # Minimum spacing, in seconds, between consecutive calls.
        self.delay = 1.0 / calls_per_second
        # Timestamp of the previous call; 0 means "never called yet".
        self.last_call = 0

    def wait(self):
        """Block until enough time has passed since the previous call."""
        since_last = time.time() - self.last_call
        remaining = self.delay - since_last
        if remaining > 0:
            time.sleep(remaining)
        self.last_call = time.time()

limiter = RateLimiter(calls_per_second=2)
Enter fullscreen mode Exit fullscreen mode

2. Cache Results

import json
import hashlib
from pathlib import Path

CACHE_DIR = Path('cache')
CACHE_DIR.mkdir(exist_ok=True)

def cached_request(url, ttl_hours=1):
    """GET *url* and return its JSON body, caching responses on disk.

    Parameters:
        url: the URL to fetch.
        ttl_hours: how long (in hours) a cached response stays valid.

    Returns:
        The decoded JSON payload, from cache when fresh enough.

    Raises:
        requests.HTTPError: on a non-2xx response (never cached).
    """
    # File name is derived from the URL; md5 is fine here — it's a cache
    # key, not a security boundary.
    cache_key = hashlib.md5(url.encode()).hexdigest()
    cache_file = CACHE_DIR / f'{cache_key}.json'

    if cache_file.exists():
        data = json.loads(cache_file.read_text())
        age_hours = (time.time() - data['timestamp']) / 3600
        if age_hours < ttl_hours:
            return data['content']

    # Bug fix: the original cached whatever came back, including error
    # payloads, and could hang forever without a timeout.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    cache_data = {
        'timestamp': time.time(),
        'content': response.json()
    }
    cache_file.write_text(json.dumps(cache_data))
    return cache_data['content']
Enter fullscreen mode Exit fullscreen mode

3. Handle Failures Gracefully

from tenacity import retry, stop_after_attempt, wait_exponential

@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10))
def fetch_with_retry(url):
    """GET *url* and return its JSON body.

    Retried up to 3 times with exponential backoff (1-10 s) by tenacity;
    a non-2xx status raises requests.HTTPError, which triggers a retry.
    """
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    return resp.json()
Enter fullscreen mode Exit fullscreen mode

4. Store Data Properly

import sqlite3
import pandas as pd

def store_ohlcv(df, symbol, db_path='market_data.db'):
    """Append OHLCV rows for *symbol* to the 'ohlcv' table in *db_path*.

    Parameters:
        df: DataFrame of bar data; it is NOT modified (a tagged copy is
            written instead — the original version mutated the caller's
            frame by adding a 'symbol' column in place).
        symbol: ticker symbol stored alongside each row.
        db_path: path to the SQLite database file.
    """
    tagged = df.copy()
    tagged['symbol'] = symbol
    conn = sqlite3.connect(db_path)
    try:
        tagged.to_sql('ohlcv', conn, if_exists='append', index=False)
    finally:
        # Bug fix: the original leaked the connection if to_sql raised.
        conn.close()
Enter fullscreen mode Exit fullscreen mode

Building a Data Pipeline

class MarketDataPipeline:
    """Fetch, cache, and persist daily bars for a list of symbols."""

    def __init__(self, symbols):
        self.symbols = symbols
        # Shared limiter: at most 2 requests per second across all symbols.
        self.limiter = RateLimiter(2)
        # NOTE(review): this connection is never used (store_ohlcv opens
        # its own) and is never closed — kept only so existing callers
        # that read `pipeline.db` keep working.
        self.db = sqlite3.connect('market_data.db')

    def run_daily(self):
        """Pull and store the latest bars for every configured symbol."""
        for sym in self.symbols:
            self.limiter.wait()
            try:
                self.store(sym, self.fetch(sym))
            except Exception as e:
                # Best-effort batch: one bad symbol must not abort the rest.
                print(f'Error {sym}: {e}')

    def fetch(self, symbol):
        """Return (possibly cached) bar data for *symbol*."""
        endpoint = f'https://api.example.com/v1/bars/{symbol}'
        return cached_request(endpoint, ttl_hours=4)

    def store(self, symbol, data):
        """Convert *data* to a DataFrame and append it to the database."""
        store_ohlcv(pd.DataFrame(data), symbol)
Enter fullscreen mode Exit fullscreen mode

Legal Considerations

  • Always check the site's robots.txt and ToS
  • Use official APIs when available
  • Don't overload servers
  • Cache aggressively to reduce requests
  • Consider paid data providers for production use

Financial data is the foundation of any trading analysis. Whether you're building indicators, backtesting strategies, or comparing firms on platforms like propfirmkey.com, clean and reliable data makes all the difference.


What data sources do you use for your trading analysis?

Top comments (0)