There are over 4.8 million active Shopify stores worldwide. If you're doing competitor research, building a price comparison tool, tracking product trends, or sourcing for dropshipping, Shopify stores are a goldmine of structured product data.
The best part? Shopify has a little-known public endpoint that makes scraping significantly easier than most ecommerce platforms.
The Shopify products.json Endpoint
Every Shopify store exposes a public JSON API at /products.json. No authentication needed:
https://store-name.myshopify.com/products.json
https://custom-domain.com/products.json
This returns up to 250 products per page with full details: titles, descriptions, prices, variants, images, inventory availability, and more.
Basic Shopify Scraper
Here's a straightforward Python scraper that pulls every product from a Shopify store:
import requests
import time
import csv
def scrape_shopify_store(domain: str) -> list[dict]:
    """Scrape every product variant from a Shopify store.

    Pages through the store's public /products.json endpoint (250
    products per page) and flattens each product into one dict per
    variant.

    Args:
        domain: Store domain without scheme, e.g. "allbirds.com".

    Returns:
        A list of flat dicts, one per product variant.
    """
    products: list[dict] = []
    page = 1
    while True:
        url = f"https://{domain}/products.json?limit=250&page={page}"
        response = requests.get(url, timeout=30, headers={
            "User-Agent": "Mozilla/5.0 (compatible; ProductResearch/1.0)"
        })
        # Shopify signals rate limiting with the non-standard 430 status;
        # back off and retry the same page.
        if response.status_code == 430:
            print(f"Rate limited on page {page}, waiting 30s...")
            time.sleep(30)
            continue
        if response.status_code != 200:
            print(f"Got status {response.status_code} on page {page}")
            break
        data = response.json()
        batch = data.get("products", [])
        if not batch:
            break  # empty page means we've walked past the last product
        for product in batch:
            # Bug fix: `product.get("images", [{}])[0]` crashed with
            # IndexError when a product had an *empty* images list (the
            # .get default only applies when the key is missing).
            images = product.get("images") or [{}]
            for variant in product.get("variants", []):
                products.append({
                    "title": product["title"],
                    "product_type": product.get("product_type", ""),
                    "vendor": product.get("vendor", ""),
                    "variant": variant.get("title", "Default"),
                    "price": variant.get("price"),
                    "compare_at_price": variant.get("compare_at_price"),
                    "sku": variant.get("sku"),
                    "available": variant.get("available", False),
                    "created_at": product.get("created_at"),
                    "updated_at": product.get("updated_at"),
                    "tags": ", ".join(product.get("tags", [])),
                    "image": images[0].get("src", ""),
                    "url": f"https://{domain}/products/{product['handle']}",
                })
        print(f"Page {page}: {len(batch)} products")
        page += 1
        time.sleep(1)  # polite per-page delay to stay under rate limits
    return products
# Scrape a store
# Example run: pull every variant from a live store (requires network
# access) and report the flattened variant count.
products = scrape_shopify_store("allbirds.com")
print(f"Total variants: {len(products)}")
Exporting to CSV
def export_to_csv(products: list[dict], filename: str = "products.csv"):
    """Export scraped products to a CSV file.

    Column order comes from the keys of the first product dict; all
    dicts are assumed to share the same keys (as produced by
    scrape_shopify_store).

    Args:
        products: Flat product-variant dicts.
        filename: Output path (default "products.csv").
    """
    if not products:
        return  # nothing to write; don't create an empty file
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=products[0].keys())
        writer.writeheader()
        writer.writerows(products)
    # Bug fix: the message previously printed the literal "(unknown)"
    # instead of the actual output filename.
    print(f"Exported {len(products)} products to {filename}")
export_to_csv(products)
Scraping Multiple Stores for Competitive Analysis
For comparing competitors, batch scrape and save each store separately:
import json
from pathlib import Path
# Example Shopify storefronts to benchmark against each other.
COMPETITORS = [
    "allbirds.com",
    "gymshark.com",
    "fashionnova.com",
    "colourpop.com",
    "kyliecosmetics.com",
]
def scrape_competitors(domains: list[str], output_dir: str = "data"):
    """Scrape several Shopify stores, saving each store's variants as JSON.

    Each store is written to <output_dir>/<domain with dots underscored>.json.
    A failure on one store is logged and skipped so the rest of the batch
    still runs.
    """
    out_root = Path(output_dir)
    out_root.mkdir(exist_ok=True)
    for store in domains:
        print(f"\nScraping {store}...")
        try:
            variants = scrape_shopify_store(store)
            dest = out_root / f"{store.replace('.', '_')}.json"
            with open(dest, "w") as fh:
                json.dump(variants, fh, indent=2)
            print(f" Saved {len(variants)} variants to {dest}")
        except Exception as exc:
            print(f" Failed: {exc}")
        time.sleep(5)  # Be nice between stores
scrape_competitors(COMPETITORS)
Detecting if a Site Runs Shopify
Not sure if a website is on Shopify? Check programmatically:
def is_shopify(domain: str) -> bool:
    """Detect whether a website runs on Shopify.

    Primary check: the public /products.json endpoint returns JSON with
    a "products" key. Fallback: look for Shopify-specific response
    headers on the homepage.

    Args:
        domain: Domain without scheme, e.g. "allbirds.com".

    Returns:
        True if the site appears to run on Shopify, else False.
    """
    try:
        resp = requests.get(
            f"https://{domain}/products.json?limit=1",
            timeout=10,
            allow_redirects=True,
        )
        if resp.status_code == 200:
            data = resp.json()
            return "products" in data
    except Exception:
        pass  # endpoint blocked or non-JSON body; fall through to headers
    # Fallback: check for Shopify headers
    try:
        # Bug fix: requests.head() defaults to allow_redirects=False, so a
        # bare-domain -> www (or http -> https) redirect previously returned
        # the redirect's headers and missed the Shopify markers. Follow
        # redirects here, consistent with the GET above.
        resp = requests.head(f"https://{domain}", timeout=10, allow_redirects=True)
        return "shopify" in resp.headers.get("x-shopify-stage", "").lower() \
            or "shopify" in resp.headers.get("server", "").lower()
    except Exception:
        return False
# Test some domains
# Spot-check detection against known-Shopify and non-Shopify storefronts
# (requires network access).
for domain in ["allbirds.com", "nike.com", "gymshark.com"]:
    print(f"{domain}: {'Shopify' if is_shopify(domain) else 'Not Shopify'}")
Handling Common Challenges
Rate Limiting (HTTP 430)
Shopify returns status 430 when you hit their rate limit. Solutions:
- Add delays — 1-2 seconds between pages, 5+ seconds between stores
- Use ScraperAPI — rotates IPs automatically so you spread requests across many addresses
- Use ThorData residential proxies — a pool of real residential IPs makes your requests look like normal browser traffic
def scrape_with_proxy(domain: str, proxy_url: str) -> list[dict]:
    """Scrape with rotating proxy to avoid rate limits.

    Pages through /products.json via the given proxy. Note this returns
    the raw Shopify product dicts (not flattened per variant).

    Args:
        domain: Store domain without scheme.
        proxy_url: Proxy endpoint used for both http and https traffic.
    """
    proxies = {
        "http": proxy_url,
        "https": proxy_url,
    }
    results: list[dict] = []
    page = 1
    while True:
        endpoint = f"https://{domain}/products.json?limit=250&page={page}"
        resp = requests.get(endpoint, proxies=proxies, timeout=30)
        if resp.status_code != 200:
            break
        page_products = resp.json().get("products", [])
        if not page_products:
            break  # ran past the last page
        results.extend(page_products)
        page += 1
        time.sleep(1)
    return results
Stores That Block products.json
Some merchants disable the public endpoint. In that case:
- Scrape the collection pages' HTML instead (/collections/all)
- Use ScraperAPI with render=true to execute JavaScript
- Check out pre-built Shopify scrapers on Apify that handle edge cases like custom themes and pagination
Getting Collection and Category Data
def scrape_collections(domain: str) -> list[dict]:
    """Get all product collections/categories.

    Fetches the store's public /collections.json endpoint and returns a
    summary dict per collection; returns [] on any non-200 response.
    """
    resp = requests.get(f"https://{domain}/collections.json", timeout=30)
    if resp.status_code != 200:
        return []
    summaries = []
    for coll in resp.json().get("collections", []):
        summaries.append({
            "title": coll["title"],
            "handle": coll["handle"],
            "products_count": coll.get("products_count"),
            "url": f"https://{domain}/collections/{coll['handle']}",
        })
    return summaries
Building a Price Monitor
The real power is in running this daily to track price changes:
import sqlite3
from datetime import datetime
def init_db(db_path: str = "prices.db"):
    """Open (or create) the SQLite price-history database.

    Ensures the price_history table exists and returns the open
    connection.
    """
    schema = """
    CREATE TABLE IF NOT EXISTS price_history (
    domain TEXT,
    sku TEXT,
    title TEXT,
    price REAL,
    compare_at_price REAL,
    available BOOLEAN,
    scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    )
    """
    connection = sqlite3.connect(db_path)
    connection.execute(schema)
    return connection
def record_prices(conn, domain: str, products: list[dict]):
    """Insert one price_history row per scraped variant and commit.

    Args:
        conn: An open sqlite3 connection with the price_history table.
        domain: Store domain the products came from.
        products: Flat dicts as produced by scrape_shopify_store().
    """
    rows = [
        (
            domain,
            p["sku"],
            p["title"],
            # Bug fix: float(None) raised TypeError when a variant had no
            # price; store NULL instead, mirroring the compare_at_price
            # handling below.
            float(p["price"]) if p["price"] is not None else None,
            float(p["compare_at_price"]) if p["compare_at_price"] else None,
            p["available"],
        )
        for p in products
    ]
    # executemany batches the inserts in one call instead of a Python-level
    # loop of single-row execute()s.
    conn.executemany(
        "INSERT INTO price_history (domain, sku, title, price, compare_at_price, available) VALUES (?, ?, ?, ?, ?, ?)",
        rows,
    )
    conn.commit()
# Run daily via cron
# One monitoring pass: open/create the database, scrape the competitor's
# current catalog, and append a timestamped price snapshot per variant.
conn = init_db()
products = scrape_shopify_store("competitor.com")
record_prices(conn, "competitor.com", products)
Set this up as a cron job and you have a full competitor price tracker running for free.
Legal Notes
The products.json endpoint serves publicly available data that Shopify intentionally exposes. That said:
- Don't hammer stores with high-frequency requests
- Don't scrape and republish product descriptions (copyright)
- Don't use scraped data for fake reviews or misleading comparisons
- Respect rate limits — the 430 status exists for a reason
Conclusion
Shopify's public JSON API makes it one of the easiest ecommerce platforms to scrape. For stores that block the endpoint, tools like ScraperAPI and ThorData proxies handle the heavy lifting. And if you'd rather skip the code, Apify has cloud-based Shopify scrapers ready to go.
Happy scraping — let me know in the comments what you're building!
Top comments (0)