Real-time financial data is essential for building trading tools and investment dashboards. Here's how to scrape stock data from Yahoo Finance and other sources.
Why Scrape Stock Data?
While APIs like Alpha Vantage exist, they often have strict rate limits. Scraping gives you:
- Near-real-time price data (quotes can be delayed) without API key limits
- Historical data going back decades
- Financial statements and analyst estimates
- Options chain data
Yahoo Finance: The Go-To Source
pip install requests beautifulsoup4 lxml pandas
Scraping Stock Quotes
import requests
from bs4 import BeautifulSoup
import json, time
class YahooFinanceScraper:
    """Small client for Yahoo Finance's semi-public JSON endpoints.

    All requests go through one ``requests.Session`` that sends a
    browser-like User-Agent header.
    """

    BASE_URL = "https://finance.yahoo.com"
    QUERY_URL = "https://query1.finance.yahoo.com"
    # Seconds to wait for a response. Without an explicit timeout, requests
    # can block forever on a stalled connection.
    TIMEOUT = 10

    def __init__(self):
        """Create the HTTP session and set its User-Agent header."""
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": (
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36"
            ),
        })

    def _fetch_first_result(self, path, ticker, root, params=None):
        """Fetch ``{QUERY_URL}{path}/{ticker}`` and return ``payload[root]["result"][0]``.

        Raises:
            ValueError: if the payload carries no result entry (the ``error``
                field of the payload is included in the message).
            Exception: whatever ``raise_for_status()`` / ``json()`` raise on
                an HTTP error status or a non-JSON body.
        """
        from urllib.parse import quote

        # Escape the ticker so characters such as "/", "?" or "^" cannot
        # alter the request path or smuggle in query parameters.
        url = f"{self.QUERY_URL}{path}/{quote(str(ticker), safe='')}"
        response = self.session.get(url, params=params, timeout=self.TIMEOUT)
        response.raise_for_status()

        section = response.json().get(root) or {}
        results = section.get("result")
        if not results:
            raise ValueError(
                f"No {root} data for {ticker!r}: {section.get('error')}"
            )
        return results[0]

    def get_historical_data(self, ticker, period="1y"):
        """Return daily OHLCV bars for *ticker* as a pandas DataFrame.

        Args:
            ticker: Symbol to look up.
            period: Value sent as the ``range`` query parameter.

        Returns:
            DataFrame with columns ``date``, ``open``, ``high``, ``low``,
            ``close`` and ``volume``. It is empty (but keeps those columns)
            when the response contains no bars.

        Raises:
            ValueError: if the response has no result entry, or if the
                returned series have mismatched lengths.
        """
        import pandas as pd
        from datetime import datetime

        result = self._fetch_first_result(
            "/v8/finance/chart",
            ticker,
            "chart",
            params={"range": period, "interval": "1d"},
        )

        # Tolerate a result with no "timestamp" or an empty quote entry so
        # callers get an empty frame instead of a KeyError.
        timestamps = result.get("timestamp") or []
        quote_entries = (result.get("indicators") or {}).get("quote") or [{}]
        quotes = quote_entries[0] or {}

        # NOTE: fromtimestamp() yields naive datetimes in the machine's local
        # timezone; kept as-is so existing callers see the same values.
        columns = {"date": [datetime.fromtimestamp(t) for t in timestamps]}
        for field in ("open", "high", "low", "close", "volume"):
            columns[field] = quotes.get(field) or []
        return pd.DataFrame(columns)

    def get_options_chain(self, ticker):
        """Return the first options chain entry for *ticker*.

        Returns:
            ``{"calls": [...], "puts": [...]}``; both lists are empty when
            the response lists no option contracts.

        Raises:
            ValueError: if the response has no result entry.
        """
        result = self._fetch_first_result(
            "/v7/finance/options", ticker, "optionChain"
        )
        # "options" may be present but empty, so fall back explicitly rather
        # than relying on the .get() default.
        chains = result.get("options") or [{}]
        first_chain = chains[0] or {}
        return {
            "calls": first_chain.get("calls", []),
            "puts": first_chain.get("puts", []),
        }
Alternative Sources
Alpha Vantage (Free API)
def get_alpha_vantage_data(ticker, api_key, session=None, timeout=10):
    """Fetch the daily time series for *ticker* from Alpha Vantage.

    Args:
        ticker: Symbol sent as the ``symbol`` query parameter.
        api_key: Alpha Vantage API key.
        session: Optional object with a requests-compatible ``get`` method
            (e.g. a ``requests.Session``) to reuse connections. Defaults to
            the module-level ``requests`` API.
        timeout: Seconds to wait for the response.

    Returns:
        The ``"Time Series (Daily)"`` mapping from the JSON response, or an
        empty dict when that key is absent.

    Raises:
        Exception: whatever ``raise_for_status()`` / ``json()`` raise on an
            HTTP error status or a non-JSON body.
    """
    http = session if session is not None else requests
    response = http.get(
        "https://www.alphavantage.co/query",
        # Passing params lets the HTTP client URL-encode the symbol and key
        # instead of splicing them into the query string by hand.
        params={
            "function": "TIME_SERIES_DAILY",
            "symbol": ticker,
            "apikey": api_key,
        },
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json().get("Time Series (Daily)", {})
Multi-Source Aggregator
class StockDataAggregator:
    """Price lookup on top of a Yahoo scraper with a short-lived cache.

    Attributes:
        yahoo: Object providing ``get_historical_data(ticker, period=...)``.
        cache: Maps ``"<ticker>_price"`` to ``{"data": ..., "time": ...}``,
            where ``time`` is the ``time.time()`` value at insertion.
        cache_ttl: Seconds a cached price stays valid.
    """

    def __init__(self, yahoo=None, cache_ttl=60):
        """Set up the data source and an empty cache.

        Args:
            yahoo: Data source to use; a ``YahooFinanceScraper`` is created
                when omitted.
            cache_ttl: Seconds a cached price is served before refetching.
        """
        self.yahoo = yahoo if yahoo is not None else YahooFinanceScraper()
        self.cache = {}
        self.cache_ttl = cache_ttl

    def _cached_price(self, ticker):
        """Return the cached price for *ticker* if still fresh, else None."""
        entry = self.cache.get(f"{ticker}_price")
        if entry is not None and time.time() - entry["time"] < self.cache_ttl:
            return entry["data"]
        return None

    def get_price(self, ticker):
        """Return the most recent close/volume/date for *ticker*.

        Returns:
            ``{"close": ..., "volume": ..., "date": ...}`` taken from the last
            row that has a close value, or ``None`` when no such row exists
            or the data source raises.
        """
        cached = self._cached_price(ticker)
        if cached is not None:
            return cached

        try:
            df = self.yahoo.get_historical_data(ticker, period="1d")
            # Skip rows whose close is missing so a null trailing bar is not
            # reported (and cached) as the current price.
            df = df.dropna(subset=["close"])
            if not df.empty:
                latest = df.iloc[-1]
                result = {
                    "close": latest["close"],
                    "volume": latest["volume"],
                    "date": str(latest["date"]),
                }
                self.cache[f"{ticker}_price"] = {
                    "data": result,
                    "time": time.time(),
                }
                return result
        except Exception as e:
            # Deliberate best-effort boundary: report the failure and fall
            # through to None instead of aborting the caller's loop.
            print(f"Yahoo Finance failed: {e}")
        return None

    def get_watchlist(self, tickers, delay=1):
        """Return ``{ticker: get_price(ticker)}`` for every ticker.

        Args:
            tickers: Iterable of symbols.
            delay: Seconds to pause between two consecutive fetches. No pause
                is made for cache hits or after the final fetch.
        """
        results = {}
        has_fetched = False
        for ticker in tickers:
            needs_fetch = self._cached_price(ticker) is None
            # Throttle only between real fetches; sleeping for cached
            # entries or after the last ticker just wastes time.
            if needs_fetch and has_fetched and delay > 0:
                time.sleep(delay)
            results[ticker] = self.get_price(ticker)
            has_fetched = has_fetched or needs_fetch
        return results
Real-Time Streaming
import websocket, json
def stream_stock_data(tickers):
    """Subscribe to *tickers* on Yahoo's streaming socket and print each update.

    Blocks in ``run_forever()`` for as long as the connection loop runs.
    Every incoming message is parsed as JSON and printed as ``<id>: $<price>``.
    """

    def handle_message(_socket, raw):
        # Decode the message and print the two fields of interest.
        payload = json.loads(raw)
        symbol = payload.get('id')
        price = payload.get('price', 'N/A')
        print(f"{symbol}: ${price}")

    def subscribe(socket):
        # Send the subscription request as soon as the socket opens.
        request = json.dumps({"subscribe": tickers})
        socket.send(request)

    app = websocket.WebSocketApp(
        "wss://streamer.finance.yahoo.com/",
        on_message=handle_message,
        on_open=subscribe,
    )
    app.run_forever()
Scaling Your Financial Scraper
For production scrapers, ScraperAPI prevents IP bans when fetching hundreds of tickers. ThorData provides residential proxies for sites blocking datacenter IPs.
Monitor your financial scrapers with ScrapeOps to catch outages quickly — stale financial data is worse than no data.
Important Disclaimers
- Yahoo Finance data has a 15-minute delay for free users
- Don't use scraped data for automated trading without validation
- Scraping financial data may violate a site's Terms of Service (ToS) — check them before you start
- This article is for educational purposes only and is not financial advice — consult a financial advisor before making investment decisions
Conclusion
Building a stock data scraper gives you unprecedented access to financial information. Start with Yahoo Finance's semi-public endpoints, add alternative sources for redundancy, and implement proper caching to minimize requests.
Top comments (0)