DEV Community

agenthustler
agenthustler

Posted on

Wikipedia Data Extraction with Python: Complete Guide for 2026

Wikipedia is the largest free knowledge base on the internet. With structured infoboxes, categories, and interlinked articles, it's a goldmine for NLP datasets, knowledge graphs, and research. Here's how to extract Wikipedia data efficiently using both the API and direct scraping.

Wikipedia API vs Scraping

Wikipedia provides a comprehensive API (MediaWiki API) that should be your first choice. Scraping is only needed for data the API doesn't expose well.

Using the Wikipedia API

import requests
import json

WIKI_API = "https://en.wikipedia.org/w/api.php"

def get_article_content(title, timeout=10):
    """Fetch an article's plain-text content and metadata via the MediaWiki API.

    Args:
        title: Exact article title (e.g. "Python (programming language)").
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A dict with ``title``, ``page_id``, ``content`` (plain text),
        ``thumbnail`` (URL or None), ``categories`` and ``links`` lists,
        or ``None`` when the title does not exist or the response is empty.

    Raises:
        requests.HTTPError: On a non-2xx HTTP response.
    """
    params = {
        "action": "query",
        "titles": title,
        "prop": "extracts|pageimages|categories|links",
        # NOTE: MediaWiki boolean params are true by *presence* — sending
        # "exintro": False would actually enable intro-only mode. To get the
        # full article text, the parameter must be omitted entirely.
        "explaintext": True,
        "pithumbsize": 500,
        "cllimit": 50,
        "pllimit": 50,
        "format": "json",
    }

    response = requests.get(WIKI_API, params=params, timeout=timeout)
    response.raise_for_status()  # fail loudly instead of parsing an error page
    data = response.json()

    pages = data.get("query", {}).get("pages", {})
    for page_id, page in pages.items():
        if "missing" in page:
            # The API reports nonexistent titles with a "missing" key
            # (and page_id "-1") rather than an HTTP error.
            return None
        return {
            "title": page.get("title"),
            "page_id": page_id,
            "content": page.get("extract", ""),
            "thumbnail": page.get("thumbnail", {}).get("source"),
            "categories": [c["title"] for c in page.get("categories", [])],
            "links": [l["title"] for l in page.get("links", [])],
        }
    return None

# Demo: fetch one article and report what came back.
article = get_article_content("Python (programming language)")
for label, value in (
    ("Title", article["title"]),
    ("Content length", f"{len(article['content'])} chars"),
    ("Categories", len(article["categories"])),
):
    print(f"{label}: {value}")
Enter fullscreen mode Exit fullscreen mode

Searching Wikipedia

def search_wikipedia(query, limit=10, timeout=10):
    """Search Wikipedia and return matching articles.

    Args:
        query: Free-text search string.
        limit: Maximum number of results to return (the API caps srlimit at 500).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of dicts with ``title``, ``page_id``, ``snippet`` (HTML with
        search-term highlighting) and ``word_count`` keys.

    Raises:
        requests.HTTPError: On a non-2xx HTTP response.
    """
    params = {
        "action": "query",
        "list": "search",
        "srsearch": query,
        "srlimit": limit,
        "format": "json",
    }

    # Without an explicit timeout, requests would wait forever on a stalled
    # connection; raise_for_status surfaces HTTP errors before JSON parsing.
    response = requests.get(WIKI_API, params=params, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    return [
        {
            "title": item["title"],
            "page_id": item["pageid"],
            "snippet": item["snippet"],
            "word_count": item["wordcount"],
        }
        for item in data.get("query", {}).get("search", [])
    ]

# Demo: run a search and list each hit with its article length.
for hit in search_wikipedia("machine learning algorithms"):
    print(f"  {hit['title']} ({hit['word_count']} words)")
Enter fullscreen mode Exit fullscreen mode

Extracting Infobox Data

Infoboxes contain structured data (population, area, founding date, etc.). The API returns this as wikitext, which needs parsing:

import re

def _clean_wiki_value(value):
    """Strip basic wiki markup from an infobox value.

    Replaces ``[[target|label]]`` / ``[[target]]`` links with their visible
    text and drops simple (non-nested) ``{{...}}`` templates.
    """
    cleaned = re.sub(r'\[\[([^|\]]*\|)?([^\]]*)\]\]', r'\2', value)
    return re.sub(r'\{\{[^}]*\}\}', '', cleaned).strip()


def get_infobox(title, timeout=10):
    """Extract infobox key/value pairs from a Wikipedia article.

    Fetches the wikitext of section 0 and parses the first ``{{Infobox ...}}``
    template with regexes. This is a best-effort parser: deeply nested
    templates or multi-line values may be truncated.

    Args:
        title: Exact article title.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A dict of cleaned infobox fields; ``{}`` when the article has no
        infobox or the API returns no pages (previously this fell through
        and returned ``None``, crashing callers that iterate the result).

    Raises:
        requests.HTTPError: On a non-2xx HTTP response.
    """
    params = {
        "action": "query",
        "titles": title,
        "prop": "revisions",
        "rvprop": "content",
        "rvsection": 0,
        "format": "json",
    }

    response = requests.get(WIKI_API, params=params, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    pages = data.get("query", {}).get("pages", {})
    for page in pages.values():
        content = page.get("revisions", [{}])[0].get("*", "")

        infobox = {}
        infobox_match = re.search(r'\{\{Infobox(.+?)\n\}\}', content, re.DOTALL)
        if infobox_match:
            # Each "| key = value" pair ends at the next "|" line or the
            # closing "}}" of the template.
            pairs = re.findall(
                r'\|\s*(.+?)\s*=\s*(.+?)(?=\n\||\n\})', infobox_match.group(1)
            )
            for key, value in pairs:
                infobox[key.strip()] = _clean_wiki_value(value)

        return infobox
    return {}

# Demo: show the first ten infobox fields for a city article.
city_info = get_infobox("San Francisco")
for key, value in list(city_info.items())[:10]:
    print(f"  {key}: {value}")
Enter fullscreen mode Exit fullscreen mode

Bulk Data Extraction with Categories

def get_category_members(category, limit=100, timeout=10):
    """Get up to ``limit`` articles in a Wikipedia category.

    Args:
        category: Category name without the "Category:" prefix.
        limit: Maximum number of members to return. (Previously this only
            set the per-request page size; the pagination loop kept following
            ``cmcontinue`` until the *entire* category was downloaded.)
        timeout: Seconds to wait for each HTTP response.

    Returns:
        A list of dicts with ``title`` and ``page_id`` keys, at most
        ``limit`` entries long.

    Raises:
        requests.HTTPError: On a non-2xx HTTP response.
    """
    params = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": f"Category:{category}",
        # The API caps cmlimit at 500 for anonymous clients.
        "cmlimit": min(limit, 500),
        "cmtype": "page",
        "format": "json",
    }

    members = []
    while len(members) < limit:
        response = requests.get(WIKI_API, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()

        for member in data.get("query", {}).get("categorymembers", []):
            members.append({
                "title": member["title"],
                "page_id": member["pageid"],
            })

        # Follow the continuation token only while we still need more results.
        if "continue" in data:
            params["cmcontinue"] = data["continue"]["cmcontinue"]
        else:
            break

    return members[:limit]

# Demo: list members of a category and report how many were found.
programming_languages = get_category_members("Programming languages")
print(f"Found {len(programming_languages)} programming languages")
Enter fullscreen mode Exit fullscreen mode

Building a Knowledge Dataset

import csv
import time

def build_dataset(category, output_file, max_articles=50):
    """Build a structured dataset from a Wikipedia category.

    Fetches up to ``max_articles`` members of ``category``, pulls each
    article's content and infobox, and writes the collection to
    ``output_file`` as pretty-printed UTF-8 JSON.

    Returns the list of article dicts that were saved.
    """
    members = get_category_members(category, limit=max_articles)
    total = min(len(members), max_articles)

    articles = []
    for index, member in enumerate(members[:max_articles], start=1):
        print(f"Processing {index}/{total}: {member['title']}")

        article = get_article_content(member["title"])
        if article:
            article["infobox"] = get_infobox(member["title"])
            articles.append(article)

        time.sleep(0.5)  # Respect rate limits

    # Export everything as a single JSON document.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(articles, f, indent=2, ensure_ascii=False)

    print(f"\nSaved {len(articles)} articles to {output_file}")
    return articles

# Demo: snapshot 30 programming-language articles into a JSON file.
dataset = build_dataset(
    "Programming languages",
    "programming_languages.json",
    max_articles=30,
)
Enter fullscreen mode Exit fullscreen mode

Using the wikipedia-api Python Package

For simpler use cases, the wikipedia-api package (imported as wikipediaapi — not to be confused with the separate "wikipedia" package) wraps the API:

# pip install wikipedia-api
import wikipediaapi

# A descriptive User-Agent is required by Wikipedia's API policy.
wiki = wikipediaapi.Wikipedia(
    user_agent="MyBot/1.0 (myemail@example.com)",
    language="en"
)

page = wiki.page("Web scraping")

if page.exists():
    print(f"Title: {page.title}")
    print(f"Summary: {page.summary[:200]}...")
    for label, count, suffix in (
        ("Full text", len(page.text), " chars"),
        ("Links", len(page.links), " outgoing links"),
        ("Categories", len(page.categories), ""),
    ):
        print(f"{label}: {count}{suffix}")
Enter fullscreen mode Exit fullscreen mode

Handling Proxies for Large-Scale Extraction

While Wikipedia is generally scraping-friendly, extracting thousands of articles quickly may trigger rate limits. ScraperAPI can help distribute requests across multiple IPs for large-scale Wikipedia data projects.

Best Practices

  1. Always use the API first — It's faster, more reliable, and explicitly allowed
  2. Set a proper User-Agent — Wikipedia requires it for API access
  3. Respect rate limits — Wikimedia asks clients to stay under 200 requests/second (a REST API guideline); for the Action API, make requests serially rather than in parallel
  4. Cache aggressively — Wikipedia content doesn't change every minute
  5. Use dumps for bulk data — For millions of articles, download Wikipedia dumps instead

Conclusion

Wikipedia data extraction is one of the most accessible and legal scraping projects you can undertake. Start with the MediaWiki API for structured access, use the wikipedia-api package for quick scripts, and resort to HTML scraping only when you need data the API doesn't provide.

Top comments (0)