DEV Community

agenthustler
agenthustler

Posted on

How to Build a Wikipedia Knowledge Graph with Python

Wikipedia contains millions of interconnected articles — a perfect foundation for building knowledge graphs. In this tutorial, we extract entities and relationships to construct a navigable knowledge graph.

Why Knowledge Graphs?

Knowledge graphs power Google answer boxes, recommendation engines, and AI assistants. Building one from Wikipedia creates a structured dataset mapping how concepts relate — far more useful than raw text.

Setup

import requests
from bs4 import BeautifulSoup
import networkx as nx
import json, time

class WikiKnowledgeGraph:
    """Builds a directed knowledge graph of Wikipedia articles via the MediaWiki API."""

    def __init__(self):
        self.graph = nx.DiGraph()  # nodes = articles/categories; edges carry a "type" attr
        self.visited = set()       # article titles already crawled, to break cycles

    def fetch_page(self, title, timeout=10):
        """Fetch parse data (rendered HTML, links, categories) for *title*.

        Returns the decoded JSON dict on HTTP 200, or None on any HTTP
        error or network failure (timeout, DNS, connection reset).
        """
        try:
            resp = requests.get(
                "https://en.wikipedia.org/w/api.php",
                params={"action": "parse", "page": title, "format": "json",
                        "prop": "text|links|categories"},
                # Wikimedia asks API clients to identify themselves.
                headers={"User-Agent": "WikiKnowledgeGraph/1.0"},
                # Original had no timeout: a stalled socket would hang the crawl forever.
                timeout=timeout,
            )
        except requests.RequestException:
            return None
        return resp.json() if resp.status_code == 200 else None
Enter fullscreen mode Exit fullscreen mode

Extracting Entities and Relationships

    def extract_relationships(self, title, depth=2):
        """Recursively crawl *title*, adding nodes and typed edges up to *depth* hops.

        Adds the article node (with a short description scraped from its first
        paragraph), "links_to" edges to existing mainspace articles,
        "belongs_to" edges to up to 5 categories, then recurses into the first
        3 mainspace links. Rate-limited with a 0.5s sleep per recursive fetch.
        """
        if title in self.visited or depth == 0:
            return
        self.visited.add(title)
        data = self.fetch_page(title)
        if not data or "parse" not in data:
            return

        parse = data["parse"]
        # Guard nested keys: the API may omit "text"/"links" for some pages;
        # the original indexed them directly and could raise KeyError.
        html = parse.get("text", {}).get("*", "")
        soup = BeautifulSoup(html, "html.parser")
        # First non-empty <p>: x is None for class-less paragraphs, which passes.
        first_para = soup.find("p", class_=lambda x: x != "mw-empty-elt")
        desc = first_para.get_text().strip()[:200] if first_para else ""
        self.graph.add_node(title, description=desc)

        links = parse.get("links", [])
        for link in links:
            # ns == 0 restricts to article namespace; exists == "" means the
            # target page exists (red links lack the key).
            if link.get("ns") == 0 and link.get("exists") == "":
                self.graph.add_edge(title, link["*"], type="links_to")

        for cat in parse.get("categories", [])[:5]:
            name = cat["*"].replace("_", " ")
            # Skip maintenance categories ("Articles with ...", etc.).
            if not name.startswith("Articles"):
                self.graph.add_node(name, type="category")
                self.graph.add_edge(title, name, type="belongs_to")

        # Recurse into the first 3 *article* links. The original sliced before
        # filtering (links[:3] then ns check), so non-article entries at the
        # head of the list silently shrank the fan-out below 3.
        article_links = [lk["*"] for lk in links if lk.get("ns") == 0][:3]
        for child in article_links:
            time.sleep(0.5)  # be polite to the API between fetches
            self.extract_relationships(child, depth - 1)
Enter fullscreen mode Exit fullscreen mode

Infobox Parser

    def parse_infobox(self, soup, title="entity"):
        """Extract key/value pairs from the page's infobox table.

        For each row, stores header-text -> cell-text in the returned dict and
        adds a typed edge from *title* to every internal /wiki/ link found in
        the cell (edge type = lowercased header text).

        Bug fixed: the original hard-coded the source node as the literal
        string "entity", collapsing every article's infobox relations onto one
        bogus node. *title* defaults to "entity" for backward compatibility;
        callers should pass the real article title.
        """
        infobox = soup.find("table", class_="infobox")
        if not infobox:
            return {}
        data = {}
        for row in infobox.find_all("tr"):
            th, td = row.find("th"), row.find("td")
            # Only header+value rows carry facts; image/title rows lack one side.
            if th and td:
                key = th.get_text().strip()
                data[key] = td.get_text().strip()
                for a in td.find_all("a", href=True):
                    if a["href"].startswith("/wiki/"):
                        target = a["href"].replace("/wiki/", "").replace("_", " ")
                        self.graph.add_edge(title, target, type=key.lower())
        return data
Enter fullscreen mode Exit fullscreen mode

Querying the Graph

    def find_path(self, source, target):
        """Return the shortest node path from *source* to *target*, or None.

        Bug fixed: nx.shortest_path raises NodeNotFound (not NetworkXNoPath)
        when either endpoint is absent from the graph; the original only
        caught NetworkXNoPath and crashed on unknown titles.
        """
        try:
            return nx.shortest_path(self.graph, source, target)
        except (nx.NetworkXNoPath, nx.NodeNotFound):
            return None

    def get_related(self, entity, depth=2):
        """Return every node within *depth* hops of *entity* (itself included).

        Unknown entities yield an empty list rather than raising.
        """
        if entity not in self.graph:
            return []
        neighborhood = nx.ego_graph(self.graph, entity, radius=depth)
        return list(neighborhood.nodes())

    def export(self, filename="wiki_kg.json"):
        """Serialize the graph to *filename* in node-link JSON format.

        Fixes: open() now pins encoding="utf-8" (the platform default varies)
        and ensure_ascii=False keeps non-ASCII Wikipedia titles readable
        instead of \\uXXXX-escaping them.
        """
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(nx.node_link_data(self.graph), f, indent=2, ensure_ascii=False)
        print(f"Exported {self.graph.number_of_nodes()} nodes, {self.graph.number_of_edges()} edges")

# Crawl a few seed articles two hops deep, then dump the graph to JSON.
seed_articles = (
    "Machine learning",
    "Neural network",
    "Python (programming language)",
)
kg = WikiKnowledgeGraph()
for seed in seed_articles:
    kg.extract_relationships(seed, depth=2)
    time.sleep(1)  # pause between seed crawls to stay polite to the API
kg.export()
Enter fullscreen mode Exit fullscreen mode

Scaling

For large-scale extraction, ScraperAPI handles rotation and retries, ThorData provides residential proxies, and ScrapeOps monitors pipeline health.

What You Can Build

  • AI chatbot knowledge base grounded in structured facts
  • Research tools mapping connections between concepts
  • Content recommendation via graph proximity
  • Fact verification against structured Wikipedia data

Top comments (0)