Wikipedia contains millions of interconnected articles — a perfect foundation for building knowledge graphs. In this tutorial, we extract entities and relationships to construct a navigable knowledge graph.
Why Knowledge Graphs?
Knowledge graphs power Google's answer boxes, recommendation engines, and AI assistants. Building one from Wikipedia creates a structured dataset mapping how concepts relate — far more useful than raw text.
Setup
import requests
from bs4 import BeautifulSoup
import networkx as nx
import json, time
class WikiKnowledgeGraph:
    """Builds a directed knowledge graph from Wikipedia articles."""

    def __init__(self):
        # Directed graph: edges point from a source article to a linked entity.
        self.graph = nx.DiGraph()
        # Titles already crawled — prevents re-fetching and infinite cycles.
        self.visited = set()
def fetch_page(self, title):
    """Fetch parse data (HTML, links, categories) for one Wikipedia article.

    Returns the decoded JSON dict on HTTP 200, or None on a non-200
    status or any network failure.
    """
    try:
        resp = requests.get(
            "https://en.wikipedia.org/w/api.php",
            params={
                "action": "parse",
                "page": title,
                "format": "json",
                "prop": "text|links|categories",
            },
            # Wikimedia API etiquette: identify the client.
            headers={"User-Agent": "WikiKnowledgeGraph/1.0"},
            # Original had no timeout — a stalled connection would hang forever.
            timeout=10,
        )
    except requests.RequestException:
        # Treat connection errors/timeouts like a failed fetch, not a crash.
        return None
    return resp.json() if resp.status_code == 200 else None
Extracting Entities and Relationships
def extract_relationships(self, title, depth=2):
    """Recursively crawl *title*, adding nodes and edges to self.graph.

    depth is the number of remaining recursion levels; 0 stops the crawl.
    Pages already in self.visited are skipped.
    """
    if title in self.visited or depth == 0:
        return
    self.visited.add(title)
    data = self.fetch_page(title)
    if not data or "parse" not in data:
        return
    parse = data["parse"]
    # Node description: first non-empty paragraph, truncated to 200 chars.
    # .get chains guard against responses missing "text"/"links" (the
    # original indexed them directly and raised KeyError).
    html = parse.get("text", {}).get("*", "")
    soup = BeautifulSoup(html, "html.parser")
    first_para = soup.find("p", class_=lambda x: x != "mw-empty-elt")
    desc = first_para.get_text().strip()[:200] if first_para else ""
    self.graph.add_node(title, description=desc)
    links = parse.get("links", [])
    for link in links:
        # ns == 0 restricts to main-namespace articles; the API marks an
        # existing target page with an empty-string "exists" attribute.
        if link.get("ns") == 0 and link.get("exists") == "":
            self.graph.add_edge(title, link["*"], type="links_to")
    # First five categories, skipping maintenance ones ("Articles with ...").
    for cat in parse.get("categories", [])[:5]:
        name = cat["*"].replace("_", " ")
        if not name.startswith("Articles"):
            self.graph.add_node(name, type="category")
            self.graph.add_edge(title, name, type="belongs_to")
    # Recurse into the first three article links, rate-limited to be polite.
    for link in links[:3]:
        if link.get("ns") == 0:
            time.sleep(0.5)
            self.extract_relationships(link["*"], depth - 1)
Infobox Parser
def parse_infobox(self, soup, title="entity"):
    """Extract key/value pairs from the page's infobox table.

    Also adds a typed edge from *title* to every internal wiki link found
    in an infobox value, using the lowercased row label as the edge type.

    Bug fix: the original hard-coded the edge source as the literal string
    "entity", so every infobox relationship across all pages collapsed
    onto one bogus node. Pass the real page title to attach edges to the
    right node; the default preserves the old behavior for existing callers.
    """
    infobox = soup.find("table", class_="infobox")
    if not infobox:
        return {}
    data = {}
    for row in infobox.find_all("tr"):
        th, td = row.find("th"), row.find("td")
        if th and td:
            key = th.get_text().strip()  # hoisted: was recomputed per link
            data[key] = td.get_text().strip()
            for a in td.find_all("a", href=True):
                if a["href"].startswith("/wiki/"):
                    self.graph.add_edge(
                        title,
                        a["href"].replace("/wiki/", "").replace("_", " "),
                        type=key.lower(),
                    )
    return data
Querying the Graph
def find_path(self, source, target):
    """Shortest directed path from source to target, or None if unreachable.

    nx.shortest_path raises NodeNotFound — not NetworkXNoPath — when either
    endpoint is missing from the graph; the original only caught the latter,
    so querying an unknown entity crashed. Catch both so it returns None.
    """
    try:
        return nx.shortest_path(self.graph, source, target)
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        return None
def get_related(self, entity, depth=2):
    """Return every node within *depth* hops of *entity*.

    Unknown entities yield an empty list rather than raising.
    """
    if entity not in self.graph:
        return []
    neighborhood = nx.ego_graph(self.graph, entity, radius=depth)
    return list(neighborhood.nodes())
def export(self, filename="wiki_kg.json"):
    """Serialize the graph to *filename* in NetworkX node-link JSON format."""
    payload = nx.node_link_data(self.graph)
    with open(filename, "w") as out:
        json.dump(payload, out, indent=2)
    node_count = self.graph.number_of_nodes()
    edge_count = self.graph.number_of_edges()
    print(f"Exported {node_count} nodes, {edge_count} edges")
def main():
    """Crawl three seed articles and export the resulting knowledge graph."""
    kg = WikiKnowledgeGraph()
    for seed in ["Machine learning", "Neural network", "Python (programming language)"]:
        kg.extract_relationships(seed, depth=2)
        time.sleep(1)  # be polite between seed crawls
    kg.export()


# Guard the entry point so importing this module doesn't kick off a
# multi-minute network crawl as a side effect.
if __name__ == "__main__":
    main()
Scaling
For large-scale extraction, ScraperAPI handles IP rotation and automatic retries, ThorData provides residential proxies, and ScrapeOps monitors pipeline health.
What You Can Build
- AI chatbot knowledge base grounded in structured facts
- Research tools mapping connections between concepts
- Content recommendation via graph proximity
- Fact verification against structured Wikipedia data
Top comments (0)