DEV Community

agenthustler
agenthustler

Posted on

How to Build a Citation Graph for Academic Paper Networks

Academic citations form a rich network that reveals research influence and knowledge flow. Here's how to scrape citation data and build a citation graph for academic analysis.

Why Build a Citation Graph?

Citation networks help identify seminal papers, discover research clusters, track idea propagation, and find under-cited work that deserves attention.

Setup

import requests
import networkx as nx
import pandas as pd
from collections import deque
import time
Enter fullscreen mode Exit fullscreen mode

Using Semantic Scholar API

def get_paper_details(paper_id):
    """Fetch metadata for a single paper from the Semantic Scholar Graph API.

    Requests the title, authors, year, citation count, and the paper IDs /
    titles of both its references and its citing papers.

    Returns the decoded JSON dict on HTTP 200, otherwise None.
    """
    fields = (
        "title,authors,year,citationCount,"
        "references.title,references.paperId,"
        "citations.title,citations.paperId"
    )
    response = requests.get(
        f"https://api.semanticscholar.org/graph/v1/paper/{paper_id}",
        params={"fields": fields},
        timeout=15,
    )
    if response.status_code != 200:
        return None
    return response.json()

def search_papers(query, limit=20):
    """Search Semantic Scholar for papers matching *query*.

    Returns up to *limit* result dicts (title, authors, year,
    citationCount, paperId); an empty list on any non-200 response.
    """
    response = requests.get(
        "https://api.semanticscholar.org/graph/v1/paper/search",
        params={
            "query": query,
            "limit": limit,
            "fields": "title,authors,year,citationCount,paperId",
        },
        timeout=15,
    )
    if response.status_code != 200:
        return []
    return response.json().get("data", [])
Enter fullscreen mode Exit fullscreen mode

Building the Citation Graph

def build_citation_graph(seed_paper_id, depth=2, max_papers=200):
    """Breadth-first expansion of the citation neighbourhood around a seed.

    Edges point from the citing paper to the cited paper. Traversal stops
    when *max_papers* papers have been fetched or the frontier up to
    *depth* hops from the seed is exhausted. Edge endpoints beyond the
    depth limit appear in the graph without node attributes.
    """
    graph = nx.DiGraph()
    frontier = deque([(seed_paper_id, 0)])
    seen = set()

    while frontier and len(seen) < max_papers:
        pid, level = frontier.popleft()
        if pid in seen or level > depth:
            continue
        seen.add(pid)

        details = get_paper_details(pid)
        if not details:
            continue

        graph.add_node(
            pid,
            title=details.get("title", "Unknown"),
            year=details.get("year"),
            citation_count=details.get("citationCount", 0),
            authors=", ".join(a["name"] for a in details.get("authors", [])[:3]),
        )

        next_level = level + 1

        # Outgoing edges: this paper cites each of its references.
        for ref in details.get("references", []) or []:
            rid = ref.get("paperId")
            if rid:
                graph.add_edge(pid, rid)
                if next_level <= depth:
                    frontier.append((rid, next_level))

        # Incoming edges: each citing paper points at this one.
        for cit in details.get("citations", []) or []:
            cid = cit.get("paperId")
            if cid:
                graph.add_edge(cid, pid)
                if next_level <= depth:
                    frontier.append((cid, next_level))

        time.sleep(1)  # throttle to respect the public API rate limit

    return graph
Enter fullscreen mode Exit fullscreen mode

Analyzing the Network

def analyze_citation_network(G):
    """Summarise a citation graph: size plus its most central papers.

    PageRank surfaces influential papers; betweenness centrality surfaces
    "bridge" papers connecting otherwise separate clusters. Each top list
    holds the ten highest-scoring (paper_id, score) pairs.
    """
    by_score = lambda item: item[1]
    ranked = sorted(nx.pagerank(G).items(), key=by_score, reverse=True)
    bridges = sorted(nx.betweenness_centrality(G).items(), key=by_score, reverse=True)

    return {
        "total_papers": G.number_of_nodes(),
        "total_citations": G.number_of_edges(),
        "top_by_pagerank": ranked[:10],
        "top_bridges": bridges[:10],
    }

def print_analysis(G, results):
    """Pretty-print the summary produced by analyze_citation_network.

    Shows overall graph size and the five highest-PageRank papers,
    with titles truncated to 60 characters.
    """
    print("\nCitation Network Analysis")
    papers, cites = results["total_papers"], results["total_citations"]
    print(f"Papers: {papers} | Citations: {cites}")
    print("\nMost Influential (PageRank):")
    for paper_id, score in results["top_by_pagerank"][:5]:
        node_attrs = G.nodes[paper_id]
        title = node_attrs.get("title", "Unknown")[:60]
        print(f"  {score:.4f} - {title}")
Enter fullscreen mode Exit fullscreen mode

Exporting for Visualization

def export_graph(G, filename="citation_graph"):
    """Export the graph for external visualization tools.

    Writes ``<filename>.gexf`` (loadable in Gephi) and
    ``<filename>_nodes.csv`` with one row of node metadata per paper.
    """
    # Fix: build output paths from the `filename` parameter — the previous
    # literals ignored it, so every export clobbered the same files.
    nx.write_gexf(G, f"{filename}.gexf")
    nodes_data = [
        {
            "id": node,
            "title": attrs.get("title", ""),
            "year": attrs.get("year"),
            "citations": attrs.get("citation_count", 0),
        }
        for node, attrs in G.nodes(data=True)
    ]
    pd.DataFrame(nodes_data).to_csv(f"{filename}_nodes.csv", index=False)
Enter fullscreen mode Exit fullscreen mode

Handling Rate Limits

  • ScraperAPI — Reliable proxy rotation for Google Scholar and PubMed
  • ThorData — Residential proxies for sites blocking datacenter IPs
  • ScrapeOps — Monitor API success rates and quota usage

Conclusion

Citation graphs transform flat reference lists into rich knowledge networks. Start with a seed paper in your field and explore outward.

Top comments (0)