Academic citations form a rich network that reveals research influence and knowledge flow. Here's how to scrape citation data and build a citation graph for academic analysis.
Why Build a Citation Graph?
Citation networks help identify seminal papers, discover research clusters, track idea propagation, and find under-cited work that deserves attention.
Setup
import requests
import networkx as nx
import pandas as pd
from collections import deque
import time
Using the Semantic Scholar API
def get_paper_details(paper_id):
    """Fetch one paper's metadata plus its reference and citation lists.

    Args:
        paper_id: Semantic Scholar paper identifier.

    Returns:
        The parsed JSON dict on HTTP 200, otherwise None.
    """
    endpoint = f"https://api.semanticscholar.org/graph/v1/paper/{paper_id}"
    # Request only the fields the graph builder actually consumes.
    wanted = (
        "title,authors,year,citationCount,"
        "references.title,references.paperId,"
        "citations.title,citations.paperId"
    )
    response = requests.get(endpoint, params={"fields": wanted}, timeout=15)
    if response.status_code != 200:
        return None
    return response.json()
def search_papers(query, limit=20):
    """Keyword-search Semantic Scholar and return a list of result dicts.

    Args:
        query: free-text search string.
        limit: maximum number of results to request (default 20).

    Returns:
        The API's "data" list on HTTP 200, otherwise an empty list.
    """
    response = requests.get(
        "https://api.semanticscholar.org/graph/v1/paper/search",
        params={
            "query": query,
            "limit": limit,
            "fields": "title,authors,year,citationCount,paperId",
        },
        timeout=15,
    )
    if response.status_code != 200:
        return []
    return response.json().get("data", [])
Building the Citation Graph
def build_citation_graph(seed_paper_id, depth=2, max_papers=200):
    """BFS outward from a seed paper, following references and citations.

    Edge direction encodes "cites": an edge u -> v means paper u cites
    paper v.

    Args:
        seed_paper_id: Semantic Scholar id of the starting paper.
        depth: maximum BFS distance from the seed (default 2).
        max_papers: stop after this many papers have been fetched.

    Returns:
        A networkx.DiGraph whose nodes carry title/year/citation metadata.
    """
    G = nx.DiGraph()
    queue = deque([(seed_paper_id, 0)])
    visited = set()
    while queue and len(visited) < max_papers:
        paper_id, current_depth = queue.popleft()
        if paper_id in visited or current_depth > depth:
            continue
        visited.add(paper_id)
        paper = get_paper_details(paper_id)
        # Throttle after EVERY API call. The original slept only on
        # success, so error responses (typically rate-limit failures)
        # were retried back-to-back with no delay -- exactly when
        # backing off matters most.
        time.sleep(1)
        if not paper:
            continue
        G.add_node(
            paper_id,
            title=paper.get("title", "Unknown"),
            year=paper.get("year"),
            citation_count=paper.get("citationCount", 0),
            # Keep the node label compact: first three authors only.
            authors=", ".join(a["name"] for a in paper.get("authors", [])[:3]),
        )
        next_depth = current_depth + 1
        # Outbound: this paper cites each of its references.
        for ref in paper.get("references") or []:
            ref_id = ref.get("paperId")  # may be None for unresolved refs
            if ref_id:
                G.add_edge(paper_id, ref_id)
                if next_depth <= depth:
                    queue.append((ref_id, next_depth))
        # Inbound: each citing paper cites this one.
        for cit in paper.get("citations") or []:
            cit_id = cit.get("paperId")
            if cit_id:
                G.add_edge(cit_id, paper_id)
                if next_depth <= depth:
                    queue.append((cit_id, next_depth))
    return G
Analyzing the Network
def analyze_citation_network(G):
    """Summarize a citation DiGraph: sizes plus two centrality rankings.

    Args:
        G: a networkx.DiGraph of papers (nodes) and citations (edges).

    Returns:
        Dict with node/edge counts and the ten highest-scoring papers by
        PageRank ("influence") and betweenness centrality ("bridges").
    """
    def _top_ten(score_map):
        # Sort (node, score) pairs by score, descending, keep the top 10.
        ranked = sorted(score_map.items(), key=lambda kv: kv[1], reverse=True)
        return ranked[:10]

    return {
        "total_papers": G.number_of_nodes(),
        "total_citations": G.number_of_edges(),
        "top_by_pagerank": _top_ten(nx.pagerank(G)),
        "top_bridges": _top_ten(nx.betweenness_centrality(G)),
    }
def print_analysis(G, results):
    """Print a short human-readable summary of the analysis results.

    Args:
        G: the graph the results were computed from (used for node titles).
        results: the dict returned by analyze_citation_network().
    """
    print("\nCitation Network Analysis")
    summary = f"Papers: {results['total_papers']} | Citations: {results['total_citations']}"
    print(summary)
    print("\nMost Influential (PageRank):")
    # Show only the top five, with titles truncated to one line.
    for paper_id, rank_score in results["top_by_pagerank"][:5]:
        node_title = G.nodes[paper_id].get("title", "Unknown")[:60]
        print(f" {rank_score:.4f} - {node_title}")
Exporting for Visualization
def export_graph(G, filename="citation_graph"):
    """Export the graph as GEXF (for Gephi/Cytoscape) plus a node CSV.

    Args:
        G: the citation DiGraph to export.
        filename: basename (no extension) for the two output files;
            writes "<filename>.gexf" and "<filename>_nodes.csv".
    """
    # Bug fix: the original interpolated a mangled literal instead of the
    # `filename` parameter, so the argument was silently ignored and output
    # always went to a hard-coded path.
    nx.write_gexf(G, f"{filename}.gexf")
    nodes_data = [
        {
            "id": node,
            "title": attrs.get("title", ""),
            "year": attrs.get("year"),
            "citations": attrs.get("citation_count", 0),
        }
        for node, attrs in G.nodes(data=True)
    ]
    pd.DataFrame(nodes_data).to_csv(f"{filename}_nodes.csv", index=False)
Handling Rate Limits
The free Semantic Scholar API enforces request throttling (hence the one-second sleep in the crawler). If you outgrow it or need to scrape sources that block automated traffic, these services can help:
- ScraperAPI — Reliable proxy rotation for Google Scholar and PubMed
- ThorData — Residential proxies for sites blocking datacenter IPs
- ScrapeOps — Monitor API success rates and quota usage
Conclusion
Citation graphs transform flat reference lists into rich knowledge networks. Start with a seed paper in your field and explore outward.
Top comments (0)