Wikipedia is the largest free knowledge base on the internet. With structured infoboxes, categories, and interlinked articles, it's a goldmine for NLP datasets, knowledge graphs, and research. Here's how to extract Wikipedia data efficiently using both the API and direct scraping.
Wikipedia API vs Scraping
Wikipedia provides a comprehensive API (MediaWiki API) that should be your first choice. Scraping is only needed for data the API doesn't expose well.
Using the Wikipedia API
import requests
import json
WIKI_API = "https://en.wikipedia.org/w/api.php"
def get_article_content(title):
    """Fetch an article's plain-text extract plus metadata via the MediaWiki API.

    Args:
        title: Exact article title, e.g. "Python (programming language)".

    Returns:
        dict with title, page_id, content, thumbnail, categories, and links,
        or None when the API response contains no pages.

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    params = {
        "action": "query",
        "titles": title,
        "prop": "extracts|pageimages|categories|links",
        "exintro": False,      # full article text, not just the lead section
        "explaintext": True,   # plain text instead of HTML
        "pithumbsize": 500,
        "cllimit": 50,
        "pllimit": 50,
        "format": "json",
    }
    # Timeout prevents the call from hanging forever on a network stall.
    response = requests.get(WIKI_API, params=params, timeout=10)
    response.raise_for_status()
    data = response.json()
    pages = data.get("query", {}).get("pages", {})
    # "pages" is keyed by page id; a single-title query yields at most one entry.
    for page_id, page in pages.items():
        return {
            "title": page.get("title"),
            "page_id": page_id,
            "content": page.get("extract", ""),
            "thumbnail": page.get("thumbnail", {}).get("source"),
            "categories": [c["title"] for c in page.get("categories", [])],
            "links": [l["title"] for l in page.get("links", [])],
        }
    return None  # no pages in the response (malformed or empty reply)
# Demo: fetch one article and report a few basic stats.
article = get_article_content("Python (programming language)")
for message in (
    f"Title: {article['title']}",
    f"Content length: {len(article['content'])} chars",
    f"Categories: {len(article['categories'])}",
):
    print(message)
Searching Wikipedia
def search_wikipedia(query, limit=10):
    """Search Wikipedia and return matching articles.

    Args:
        query: Free-text search string.
        limit: Maximum number of results to return (default 10).

    Returns:
        List of dicts with title, page_id, snippet (HTML-highlighted),
        and word_count for each hit.
    """
    params = {
        "action": "query",
        "list": "search",
        "srsearch": query,
        "srlimit": limit,
        "format": "json",
    }
    # Timeout prevents the call from hanging forever on a network stall.
    response = requests.get(WIKI_API, params=params, timeout=10)
    data = response.json()
    results = []
    for item in data.get("query", {}).get("search", []):
        results.append({
            "title": item["title"],
            "page_id": item["pageid"],
            "snippet": item["snippet"],
            "word_count": item["wordcount"],
        })
    return results
# Demo: search and print each hit with its word count.
for hit in search_wikipedia("machine learning algorithms"):
    print(f" {hit['title']} ({hit['word_count']} words)")
Extracting Infobox Data
Infoboxes contain structured data (population, area, founding date, etc.). The API returns this as wikitext, which needs parsing:
import re
def _parse_infobox(wikitext):
    """Extract key/value pairs from the first {{Infobox ...}} template in wikitext.

    Returns a dict of field name -> cleaned value; empty when no infobox is found.
    Note: the regex approach does not handle deeply nested templates.
    """
    infobox = {}
    infobox_match = re.search(r'\{\{Infobox(.+?)\n\}\}', wikitext, re.DOTALL)
    if infobox_match:
        infobox_text = infobox_match.group(1)
        # Each field looks like "| key = value", terminated by the next "|" or "}}".
        pairs = re.findall(r'\|\s*(.+?)\s*=\s*(.+?)(?=\n\||\n\})', infobox_text)
        for key, value in pairs:
            # [[target|label]] -> label, [[target]] -> target
            clean_value = re.sub(r'\[\[([^|\]]*\|)?([^\]]*)\]\]', r'\2', value)
            # Drop remaining {{...}} templates (citations, formatting helpers).
            clean_value = re.sub(r'\{\{[^}]*\}\}', '', clean_value).strip()
            infobox[key.strip()] = clean_value
    return infobox


def get_infobox(title):
    """Extract infobox data from a Wikipedia article.

    Fetches the article's lead-section wikitext and parses the infobox
    template into a flat dict.

    Args:
        title: Exact article title.

    Returns:
        dict of infobox field -> cleaned value; empty dict when the article
        is missing or has no infobox (never None, so callers can iterate).
    """
    params = {
        "action": "query",
        "titles": title,
        "prop": "revisions",
        "rvprop": "content",
        "rvsection": 0,  # section 0 = lead, where the infobox lives
        "format": "json",
    }
    # Timeout prevents the call from hanging forever on a network stall.
    response = requests.get(WIKI_API, params=params, timeout=10)
    data = response.json()
    pages = data.get("query", {}).get("pages", {})
    for page_id, page in pages.items():
        content = page.get("revisions", [{}])[0].get("*", "")
        return _parse_infobox(content)
    # Previously this path returned None, crashing callers that iterate .items().
    return {}
# Demo: show the first ten infobox fields for San Francisco.
city_info = get_infobox("San Francisco")
first_ten = list(city_info.items())[:10]
for key, value in first_ten:
    print(f" {key}: {value}")
Bulk Data Extraction with Categories
def get_category_members(category, limit=100):
    """Return up to `limit` articles in a Wikipedia category.

    Follows the API's continuation tokens across pages, but stops once
    `limit` members have been collected (the original version paginated
    until the category was exhausted, ignoring `limit` entirely).

    Args:
        category: Category name without the "Category:" prefix.
        limit: Maximum number of members to return (default 100).

    Returns:
        List of dicts with title and page_id for each member article.
    """
    params = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": f"Category:{category}",
        "cmlimit": min(limit, 500),  # API caps one request at 500 for anonymous clients
        "cmtype": "page",            # articles only; skip subcategories and files
        "format": "json",
    }
    members = []
    while len(members) < limit:
        # Timeout prevents the call from hanging forever on a network stall.
        response = requests.get(WIKI_API, params=params, timeout=10)
        data = response.json()
        for member in data.get("query", {}).get("categorymembers", []):
            members.append({
                "title": member["title"],
                "page_id": member["pageid"],
            })
        # Handle pagination: follow the continuation token until exhausted.
        if "continue" in data:
            params["cmcontinue"] = data["continue"]["cmcontinue"]
        else:
            break
    return members[:limit]
# Demo: list article pages in the "Programming languages" category.
programming_languages = get_category_members("Programming languages")
count = len(programming_languages)
print(f"Found {count} programming languages")
Building a Knowledge Dataset
import csv
import time
def build_dataset(category, output_file, max_articles=50):
    """Build a structured dataset from a Wikipedia category.

    Fetches each member article's content and infobox, then writes the
    collection to `output_file` as pretty-printed JSON.

    Args:
        category: Category name without the "Category:" prefix.
        output_file: Path of the JSON file to write.
        max_articles: Upper bound on articles fetched (default 50).

    Returns:
        The list of article dicts that was written out.
    """
    members = get_category_members(category, limit=max_articles)[:max_articles]
    total = len(members)
    collected = []
    for position, member in enumerate(members, start=1):
        print(f"Processing {position}/{total}: {member['title']}")
        record = get_article_content(member["title"])
        if record:
            record["infobox"] = get_infobox(member["title"])
            collected.append(record)
        time.sleep(0.5)  # be polite: pause between requests to respect rate limits
    # Persist the dataset as UTF-8 JSON.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(collected, f, indent=2, ensure_ascii=False)
    print(f"\nSaved {len(collected)} articles to {output_file}")
    return collected
# Build a dataset of programming languages
dataset = build_dataset(
    category="Programming languages",
    output_file="programming_languages.json",
    max_articles=30,
)
Using the wikipedia-api Python Package
For simpler use cases, the wikipedia-api package (imported as wikipediaapi — not to be confused with the older wikipedia package) wraps the MediaWiki API:
# pip install wikipedia-api
import wikipediaapi

# Identify the client, as required by Wikimedia's User-Agent policy.
wiki = wikipediaapi.Wikipedia(
    user_agent="MyBot/1.0 (myemail@example.com)",
    language="en",
)

page = wiki.page("Web scraping")
if page.exists():
    for message in (
        f"Title: {page.title}",
        f"Summary: {page.summary[:200]}...",
        f"Full text: {len(page.text)} chars",
        f"Links: {len(page.links)} outgoing links",
        f"Categories: {len(page.categories)}",
    ):
        print(message)
Handling Proxies for Large-Scale Extraction
While Wikipedia is generally scraping-friendly, extracting thousands of articles quickly may trigger rate limits. ScraperAPI can help distribute requests across multiple IPs for large-scale Wikipedia data projects.
Best Practices
- Always use the API first — It's faster, more reliable, and explicitly allowed
- Set a proper User-Agent — Wikipedia requires it for API access
- Respect rate limits — Wikimedia asks clients to stay under 200 requests/second; for most scripts, sequential requests with a short pause are more than enough
- Cache aggressively — Wikipedia content doesn't change every minute
- Use dumps for bulk data — For millions of articles, download Wikipedia dumps instead
Conclusion
Wikipedia data extraction is one of the most accessible and legal scraping projects you can undertake. Start with the MediaWiki API for structured access, use the wikipedia-api package for quick scripts, and resort to HTML scraping only when you need data the API doesn't provide.
Top comments (0)