API documentation follows predictable patterns — endpoints, parameters, response schemas. What if you could scrape docs from any API and auto-generate a Python SDK? That's exactly what we'll build.
The Problem
Every SaaS product has an API. Few have good SDKs. Developers waste hours reading docs and writing boilerplate HTTP calls. Automating this saves massive time.
Architecture
- Scrape API documentation pages
- Parse endpoint definitions (method, path, params, response)
- Generate typed Python SDK classes
- Output ready-to-use wrapper code
Scraping Documentation Pages
import requests
from bs4 import BeautifulSoup
import re
API_KEY = "YOUR_SCRAPERAPI_KEY"
def scrape_docs(docs_url):
    """Fetch a documentation page through ScraperAPI and parse it.

    Args:
        docs_url: URL of the API documentation page to scrape.

    Returns:
        BeautifulSoup tree of the rendered page HTML.

    Raises:
        requests.HTTPError: if ScraperAPI returns a non-2xx status.
    """
    params = {
        "api_key": API_KEY,
        "url": docs_url,
        # Render JavaScript -- needed for docs platforms (ReadMe, GitBook,
        # Swagger UI) that load content client-side.
        "render": "true",
    }
    response = requests.get(
        "https://api.scraperapi.com", params=params, timeout=60
    )
    # Fail loudly on proxy/auth/quota errors instead of silently parsing
    # an error page as if it were documentation.
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")
ScraperAPI renders JavaScript — critical for modern docs platforms like ReadMe, GitBook, and Swagger UI that load content dynamically.
Extracting Endpoint Definitions
Most docs follow a pattern: HTTP method + path + parameter table + response example.
def extract_endpoints(soup):
    """Pull endpoint definitions out of a parsed docs page.

    Finds container elements whose class names suggest an API operation,
    then reads the HTTP method and path out of each one.

    Args:
        soup: BeautifulSoup tree of a documentation page.

    Returns:
        List of dicts with "method", "path", "params", "description" keys.
    """
    endpoints = []
    for section in soup.find_all(["div", "section"], class_=re.compile(
        r"(endpoint|api-method|operation)", re.I
    )):
        # Case-insensitive matching, consistent with the section search
        # above -- docs platforms vary in class-name casing.
        method_el = section.find(class_=re.compile(r"(method|verb|http-method)", re.I))
        path_el = section.find(class_=re.compile(r"(path|url|endpoint-url)", re.I))
        if not method_el or not path_el:
            # Container matched the class pattern but doesn't actually
            # describe an endpoint; skip it.
            continue
        endpoint = {
            "method": method_el.get_text(strip=True).upper(),
            "path": path_el.get_text(strip=True),
            "params": extract_params(section),
            # NOTE(review): extract_description is not defined in this
            # file -- confirm it exists elsewhere before running.
            "description": extract_description(section),
        }
        endpoints.append(endpoint)
    return endpoints
def extract_params(section):
    """Read an endpoint's parameter table into structured dicts.

    Expects rows of at least three cells: name, type, and a cell whose
    text indicates whether the parameter is required. A fourth cell, when
    present, is treated as the dedicated description column.

    Args:
        section: BeautifulSoup element containing the endpoint markup.

    Returns:
        List of dicts with "name", "type", "required", "description" keys.
    """
    params = []
    table = section.find("table")
    if table is None:
        return params
    # Skip the header row; data rows follow.
    for row in table.find_all("tr")[1:]:
        cols = row.find_all("td")
        if len(cols) < 3:
            continue
        # Many docs use a separate 4th column for the description; fall
        # back to the 3rd (required + description combined) when absent,
        # matching the original 3-column behavior.
        desc_col = cols[3] if len(cols) >= 4 else cols[2]
        params.append({
            "name": cols[0].get_text(strip=True),
            "type": cols[1].get_text(strip=True),
            "required": "required" in cols[2].get_text(strip=True).lower(),
            "description": desc_col.get_text(strip=True),
        })
    return params
Generating the SDK
def generate_sdk(api_name, base_url, endpoints):
    """Emit Python source for a requests-based client class.

    Args:
        api_name: Human-readable API name; drives the generated class name.
        base_url: Default base URL baked into the generated __init__.
        endpoints: Endpoint dicts as produced by extract_endpoints().

    Returns:
        The generated SDK module as a single string of valid Python.
    """
    # "My Cool-API" -> "MyCoolAPIClient"
    class_name = api_name.replace(" ", "").replace("-", "") + "Client"
    # Generated lines use 4-space indentation so the emitted module is
    # syntactically valid Python.
    lines = [
        f'"""Auto-generated SDK for {api_name}"""',
        "import requests",
        "from typing import Optional, Any",
        "",
        f"class {class_name}:",
        f'    def __init__(self, api_key: str, base_url: str = "{base_url}"):',
        "        self.api_key = api_key",
        "        self.base_url = base_url.rstrip('/')",
        "        self.session = requests.Session()",
        '        self.session.headers["Authorization"] = f"Bearer {api_key}"',
    ]
    for ep in endpoints:
        method_name = path_to_method_name(ep["path"], ep["method"])
        func_params = build_func_params(ep["params"])
        lines.extend(generate_method(ep, method_name, func_params))
    return "\n".join(lines)
def path_to_method_name(path, method):
    """Derive a snake_case Python method name from an HTTP verb + path.

    "/users/{id}" + "GET" -> "get_users_id". Only the last two path
    segments are used, to keep generated names short.

    Args:
        path: URL path template, possibly containing {placeholders}.
        method: HTTP verb, e.g. "GET".

    Returns:
        A valid Python identifier.
    """
    # Drop placeholder braces, then split into non-empty segments.
    clean = re.sub(r"[{}]", "", path)
    parts = [p for p in clean.strip("/").split("/") if p]
    prefix = {"GET": "get", "POST": "create", "PUT": "update",
              "PATCH": "update", "DELETE": "delete"}.get(method, method.lower())
    name = f"{prefix}_{'_'.join(parts[-2:])}"
    # Segments like "user-profiles" or "v1.2" would otherwise produce an
    # invalid identifier; replace every illegal character with "_".
    return re.sub(r"\W", "_", name)
Handling OpenAPI/Swagger Specs
Many APIs publish OpenAPI specs — parse those directly for better accuracy:
def parse_openapi_spec(spec_url):
    """Fetch an OpenAPI/Swagger spec and convert it to endpoint dicts.

    Args:
        spec_url: URL of the JSON OpenAPI document.

    Returns:
        List of endpoint dicts ("method", "path", "description", "params")
        in the same shape extract_endpoints() produces.

    Raises:
        requests.HTTPError: if the spec URL returns a non-2xx status.
    """
    # Bound the request so a dead host can't hang the generator, and fail
    # loudly rather than json-decoding an error page.
    resp = requests.get(spec_url, timeout=30)
    resp.raise_for_status()
    spec = resp.json()
    endpoints = []
    for path, methods in spec.get("paths", {}).items():
        for method, details in methods.items():
            # Path items may also carry non-operation keys (e.g.
            # "parameters", "summary"); only take real HTTP verbs.
            if method not in ("get", "post", "put", "patch", "delete"):
                continue
            endpoints.append({
                "method": method.upper(),
                "path": path,
                "description": details.get("summary", ""),
                "params": [
                    {
                        "name": p["name"],
                        "type": p.get("schema", {}).get("type", "str"),
                        "required": p.get("required", False),
                    }
                    for p in details.get("parameters", [])
                ],
            })
    return endpoints
Putting It Together
def build_sdk_from_docs(docs_url, api_name, base_url):
    """End-to-end pipeline: scrape docs, extract endpoints, write the SDK.

    Args:
        docs_url: Documentation page URL to scrape.
        api_name: Human-readable API name (drives class and file naming).
        base_url: Base URL baked into the generated client.

    Returns:
        The generated SDK source code (also written to disk as
        "<api_name>_client.py").
    """
    soup = scrape_docs(docs_url)
    endpoints = extract_endpoints(soup)
    print(f"Found {len(endpoints)} endpoints")
    sdk_code = generate_sdk(api_name, base_url, endpoints)
    output_file = f"{api_name.lower().replace(' ', '_')}_client.py"
    # Explicit encoding so generated source containing non-ASCII text
    # round-trips identically on every platform.
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(sdk_code)
    return sdk_code
For scraping docs from dozens of SaaS providers, use ThorData for proxy rotation and ScrapeOps for monitoring.
Production Enhancements
- Add response type hints from example responses
- Generate async versions using httpx
- Add retry logic and rate limiting
- Publish to PyPI automatically with version bumps
Auto-generating SDKs from documentation turns hours of boilerplate into seconds. Combine web scraping with code generation, and you've got a tool that writes Python clients for any API.
Happy scraping!
Top comments (0)