DEV Community

agenthustler
How to Detect Website Technology Stacks with Scraping

Knowing what technologies a website uses is valuable for sales prospecting, competitive intelligence, and security research. Tools like BuiltWith charge hundreds of dollars per month -- here's how to build your own detector.

What We Can Detect

By analyzing HTTP headers, HTML source, JavaScript files, and cookies, we can identify frameworks, CMS platforms, analytics tools, CDNs, and more.
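Response headers alone already reveal a lot before any HTML parsing. A minimal sketch -- the header names below are commonly observed real-world examples, and any given site may omit some of them:

```python
def header_hints(headers):
    """Map response headers that commonly leak stack details to what they reveal."""
    interesting = {
        "Server": "web server or CDN (e.g. 'cloudflare', 'nginx')",
        "X-Powered-By": "backend platform (e.g. 'PHP/8.1', 'Express')",
        "CF-RAY": "Cloudflare request ID -- the site sits behind Cloudflare",
        "X-Shopify-Stage": "Shopify storefront",
    }
    # Return only the interesting headers actually present in the response
    return {h: headers[h] for h in interesting if h in headers}

print(header_hints({"Server": "cloudflare", "CF-RAY": "8a1b2c3d4e5f-SJC"}))
```

The full detector below generalizes this idea to HTML, header, and cookie signatures.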

Technology Stack Detector

pip install requests

import requests

class TechStackDetector:
    def __init__(self, api_key):
        self.api_key = api_key
        self.signatures = {
            "WordPress": {"html": ["wp-content", "wp-includes"]},
            "React": {"html": ["data-reactroot", "react-root", "_reactRootContainer"]},
            "Vue.js": {"html": ["__VUE__", "v-cloak", "vue-app"]},
            "Angular": {"html": ["ng-version", "ng-app"]},
            "Next.js": {"html": ["__NEXT_DATA__", "_next/static"]},
            "Shopify": {"html": ["cdn.shopify.com", "Shopify.theme"]},
            "Cloudflare": {"headers": {"CF-RAY": "", "Server": "cloudflare"}},
            "Google Analytics": {"html": ["google-analytics.com/analytics.js", "gtag/js"]},
            "Stripe": {"html": ["js.stripe.com"]},
            "Tailwind CSS": {"html": ["tailwindcss"]},
            "Bootstrap": {"html": ["bootstrap.min.css", "bootstrap.min.js"]},
        }

    def detect(self, url):
        # Let requests URL-encode the target URL instead of string formatting
        resp = requests.get(
            "http://api.scraperapi.com/",
            params={"api_key": self.api_key, "url": url, "render": "true"},
            timeout=30,
        )
        html = resp.text.lower()
        headers = resp.headers  # CaseInsensitiveDict, so lookups ignore header casing
        cookies = {c.name: c.value for c in resp.cookies}

        detected = []
        for tech, sigs in self.signatures.items():
            confidence = 0
            for pattern in sigs.get("html", []):
                if pattern.lower() in html:
                    confidence += 40
            for header, value in sigs.get("headers", {}).items():
                # Match on header presence; when a value is given, require it too
                if header in headers and (not value or value.lower() in headers[header].lower()):
                    confidence += 30
            for cookie in sigs.get("cookies", []):
                if cookie in cookies:
                    confidence += 30

            if confidence > 0:
                detected.append({"technology": tech, "confidence": min(confidence, 100)})

        return sorted(detected, key=lambda x: x["confidence"], reverse=True)

# Usage
detector = TechStackDetector("YOUR_SCRAPERAPI_KEY")
stack = detector.detect("https://example.com")
for tech in stack:
    print(f"  {tech['technology']}: {tech['confidence']}% confidence")

Bulk Detection for Prospecting

import pandas as pd
import time

def detect_bulk(domains, api_key):
    detector = TechStackDetector(api_key)
    results = []
    for domain in domains:
        url = f"https://{domain}"
        try:
            stack = detector.detect(url)
            for tech in stack:
                results.append({"domain": domain, **tech})
        except Exception:
            results.append({"domain": domain, "technology": "Error", "confidence": 0})
        time.sleep(2)
    return pd.DataFrame(results)

domains = ["store1.com", "store2.com", "store3.com"]
df = detect_bulk(domains, "YOUR_SCRAPERAPI_KEY")
shopify_sites = df[df["technology"] == "Shopify"]
print(f"Shopify stores found: {len(shopify_sites)}")
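With results in a DataFrame, narrowing to high-confidence leads is a one-liner. A sketch with synthetic data in the same shape detect_bulk() returns -- the 70% cutoff is an arbitrary choice:

```python
import pandas as pd

# Synthetic results mimicking detect_bulk() output
df = pd.DataFrame([
    {"domain": "store1.com", "technology": "Shopify", "confidence": 80},
    {"domain": "store2.com", "technology": "WordPress", "confidence": 40},
    {"domain": "store3.com", "technology": "Shopify", "confidence": 40},
])

# Keep only confident Shopify matches; 70 is an arbitrary threshold
leads = df[(df["technology"] == "Shopify") & (df["confidence"] >= 70)]
leads.to_csv("shopify_leads.csv", index=False)
print(len(leads))  # 1
```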

Scaling Up

For large-scale detection, ScraperAPI handles JavaScript rendering and anti-bot challenges, rotating proxies from a provider like ThorData help when scanning thousands of domains, and ScrapeOps can monitor request success rates.
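One common way to speed up the bulk scan is a thread pool instead of the serial loop above. A sketch -- detect_one here is a hypothetical stand-in for detector.detect() so the snippet runs offline; a real version would call the API and should respect the provider's rate limits:

```python
from concurrent.futures import ThreadPoolExecutor

# Stand-in for detector.detect(); the real version performs the network call
def detect_one(domain):
    return {"domain": domain, "stack": []}

domains = ["store1.com", "store2.com", "store3.com"]

# Fan the per-domain scans out over a small worker pool
with ThreadPoolExecutor(max_workers=5) as pool:
    results = list(pool.map(detect_one, domains))
print(len(results))  # 3
```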

Conclusion

Building your own tech stack detector gives you much of the intelligence that expensive SaaS tools sell, and the signature-based approach is extensible -- just add new patterns as you discover them. It's a solid fit for lead generation, competitive research, and security auditing.
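The extensibility point in practice: because the signature table is a plain dict, adding a technology is a one-line update. The Drupal fingerprints below are well-known public markers (the drupal-settings-json script attribute, the /sites/default/files path, and the X-Generator header):

```python
# A trimmed copy of the detector's signature table
signatures = {
    "WordPress": {"html": ["wp-content", "wp-includes"]},
}

# Adding a new technology is just another dict entry
signatures["Drupal"] = {
    "html": ["drupal-settings-json", "/sites/default/files"],
    "headers": {"X-Generator": "Drupal"},
}
print(sorted(signatures))  # ['Drupal', 'WordPress']
```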
