Academic citations form a rich network that reveals research influence and knowledge flow. Here's how to scrape citation data and build a citation graph for academic analysis.
Why Build a Citation Graph?
Citation networks help identify seminal papers, discover research clusters, track idea propagation, and find under-cited work that deserves attention.
Setup
import requests
import networkx as nx
import pandas as pd
from collections import deque
import time
Using the Semantic Scholar API
def get_paper_details(paper_id):
    """Fetch one paper's metadata plus its reference and citation lists.

    Args:
        paper_id: Semantic Scholar paper identifier.

    Returns:
        The parsed JSON dict on HTTP 200, otherwise None.
    """
    endpoint = f"https://api.semanticscholar.org/graph/v1/paper/{paper_id}"
    # Request only the fields the graph builder actually consumes.
    wanted = (
        "title,authors,year,citationCount,"
        "references.title,references.paperId,"
        "citations.title,citations.paperId"
    )
    response = requests.get(endpoint, params={"fields": wanted}, timeout=15)
    if response.status_code != 200:
        return None
    return response.json()
def search_papers(query, limit=20):
    """Keyword-search Semantic Scholar and return a list of result dicts.

    Args:
        query: free-text search string.
        limit: maximum number of results to request (default 20).

    Returns:
        The API's "data" list on HTTP 200, otherwise an empty list.
    """
    response = requests.get(
        "https://api.semanticscholar.org/graph/v1/paper/search",
        params={
            "query": query,
            "limit": limit,
            "fields": "title,authors,year,citationCount,paperId",
        },
        timeout=15,
    )
    if response.status_code != 200:
        return []
    return response.json().get("data", [])
Building the Citation Graph
def build_citation_graph(seed_paper_id, depth=2, max_papers=200):
    """BFS outward from a seed paper, following references and citations.

    Edge direction encodes "cites": an edge u -> v means paper u cites
    paper v.

    Args:
        seed_paper_id: Semantic Scholar id of the starting paper.
        depth: maximum BFS distance from the seed (default 2).
        max_papers: stop after this many papers have been fetched.

    Returns:
        A networkx.DiGraph whose nodes carry title/year/citation metadata.
    """
    G = nx.DiGraph()
    queue = deque([(seed_paper_id, 0)])
    visited = set()
    while queue and len(visited) < max_papers:
        paper_id, current_depth = queue.popleft()
        if paper_id in visited or current_depth > depth:
            continue
        visited.add(paper_id)
        paper = get_paper_details(paper_id)
        # Throttle after EVERY API call. The original slept only on
        # success, so error responses (typically rate-limit failures)
        # were retried back-to-back with no delay -- exactly when
        # backing off matters most.
        time.sleep(1)
        if not paper:
            continue
        G.add_node(
            paper_id,
            title=paper.get("title", "Unknown"),
            year=paper.get("year"),
            citation_count=paper.get("citationCount", 0),
            # Keep the node label compact: first three authors only.
            authors=", ".join(a["name"] for a in paper.get("authors", [])[:3]),
        )
        next_depth = current_depth + 1
        # Outbound: this paper cites each of its references.
        for ref in paper.get("references") or []:
            ref_id = ref.get("paperId")  # may be None for unresolved refs
            if ref_id:
                G.add_edge(paper_id, ref_id)
                if next_depth <= depth:
                    queue.append((ref_id, next_depth))
        # Inbound: each citing paper cites this one.
        for cit in paper.get("citations") or []:
            cit_id = cit.get("paperId")
            if cit_id:
                G.add_edge(cit_id, paper_id)
                if next_depth <= depth:
                    queue.append((cit_id, next_depth))
    return G
Analyzing the Network
def analyze_citation_network(G):
    """Summarize a citation DiGraph: sizes plus two centrality rankings.

    Args:
        G: a networkx.DiGraph of papers (nodes) and citations (edges).

    Returns:
        Dict with node/edge counts and the ten highest-scoring papers by
        PageRank ("influence") and betweenness centrality ("bridges").
    """
    def _top_ten(score_map):
        # Sort (node, score) pairs by score, descending, keep the top 10.
        ranked = sorted(score_map.items(), key=lambda kv: kv[1], reverse=True)
        return ranked[:10]

    return {
        "total_papers": G.number_of_nodes(),
        "total_citations": G.number_of_edges(),
        "top_by_pagerank": _top_ten(nx.pagerank(G)),
        "top_bridges": _top_ten(nx.betweenness_centrality(G)),
    }
def print_analysis(G, results):
    """Print a short human-readable summary of the analysis results.

    Args:
        G: the graph the results were computed from (used for node titles).
        results: the dict returned by analyze_citation_network().
    """
    print("\nCitation Network Analysis")
    summary = f"Papers: {results['total_papers']} | Citations: {results['total_citations']}"
    print(summary)
    print("\nMost Influential (PageRank):")
    # Show only the top five, with titles truncated to one line.
    for paper_id, rank_score in results["top_by_pagerank"][:5]:
        node_title = G.nodes[paper_id].get("title", "Unknown")[:60]
        print(f" {rank_score:.4f} - {node_title}")
Exporting for Visualization
def export_graph(G, filename="citation_graph"):
    """Export the graph as GEXF (for Gephi/Cytoscape) plus a node CSV.

    Args:
        G: the citation DiGraph to export.
        filename: basename (no extension) for the two output files;
            writes "<filename>.gexf" and "<filename>_nodes.csv".
    """
    # Bug fix: the original interpolated a mangled literal instead of the
    # `filename` parameter, so the argument was silently ignored and output
    # always went to a hard-coded path.
    nx.write_gexf(G, f"{filename}.gexf")
    nodes_data = [
        {
            "id": node,
            "title": attrs.get("title", ""),
            "year": attrs.get("year"),
            "citations": attrs.get("citation_count", 0),
        }
        for node, attrs in G.nodes(data=True)
    ]
    pd.DataFrame(nodes_data).to_csv(f"{filename}_nodes.csv", index=False)
Handling Rate Limits
The free Semantic Scholar API enforces request throttling (hence the one-second sleep in the crawler). If you outgrow it or need to scrape sources that block automated traffic, these services can help:
- ScraperAPI — Reliable proxy rotation for Google Scholar and PubMed
- ThorData — Residential proxies for sites blocking datacenter IPs
- ScrapeOps — Monitor API success rates and quota usage
Conclusion
Citation graphs transform flat reference lists into rich knowledge networks. Start with a seed paper in your field and explore outward.
Top comments (0)