Mayank Laddha
Graph + LLM or simply LLM for summarization?

Let's say I have some documents and I want to generate a summary (get an overall essence), maybe based on certain criteria. What do you think is the better approach: creating a graph and then using an LLM to generate the summary or answer queries that involve most of the documents? Or creating summaries of the individual documents and then a final summary from those? I came up with some code that fetches news articles on a topic, creates a graph, and summarizes it. It is not perfect and I am trying to improve it, but is creating the graph worth it? I have some opinions, but I would love to hear yours.
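For reference, the second approach (summarize each document, then summarize the summaries) only needs a couple of calls. This is a minimal sketch; the prompts and the summarize_docs helper are illustrative, not something I have tuned:

import google.generativeai as genai

genai.configure(api_key="YOUR_API_KEY")  # replace with your Gemini API key
model = genai.GenerativeModel("gemini-1.5-flash-002")

def summarize_docs(docs: dict, topic: str) -> str:
    # first pass: one short summary per document
    partials = []
    for url, text in docs.items():
        response = model.generate_content(
            f"news: {text}\nSummarize the article above in 3-4 sentences, focusing on {topic}."
        )
        partials.append(f"{response.text} (source: {url})")
    # second pass: merge the partial summaries into one overview
    response = model.generate_content(
        f"Combine the following article summaries into a single coherent overview of {topic}:\n\n"
        + "\n\n".join(partials)
    )
    return response.text

And here is the graph-based version I came up with: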

import networkx as nx
from duckduckgo_search import DDGS
import requests
from bs4 import BeautifulSoup
import google.generativeai as genai
import typing_extensions as typing
import json
import matplotlib.pyplot as plt
import time
import pickle


genai.configure(api_key="YOUR_API_KEY")  # replace with your Gemini API key
model = genai.GenerativeModel("gemini-1.5-flash-002")
topic = "climate change"

# fetch recent news articles on the topic
docs = {}
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:84.0) Gecko/20100101 Firefox/84.0",
}
results = DDGS().news(topic, timelimit="w", max_results=10)



for news in results:
    try:
        page = requests.get(news["url"], headers=headers, timeout=10).text
        soup = BeautifulSoup(page, "html.parser")
        body = soup.find("body")
        if body:  # some pages may not parse cleanly
            docs[news["url"]] = body.get_text(separator="\n", strip=True)
    except Exception:  # a bare except would also swallow KeyboardInterrupt
        print(f"unable to fetch {news['url']}")


# build the graph: extract entities and relationships from each article
class Entity(typing.TypedDict):
    name: str
    type: str

class Relation(typing.TypedDict):
    source: str
    target: str
    relationship: str


G = nx.Graph()
possible_entities = ["Country", "Person", "Location", "Event", "Topic", "Policy", "Technology", "Other"]
for url in docs:
    try:
        response = model.generate_content(
            f"""news: {docs[url]} \n Based on the news article above, identify only the most relevant entities that capture the essence of the news. Entity types must be strictly limited to the following: {possible_entities}. No other types are allowed. If no relevant entity is present, return an empty list. Return each entity along with its type.""",
            generation_config=genai.GenerationConfig(
                response_mime_type="application/json", response_schema=list[Entity]
            ),
        )
        entities = json.loads(response.text)  # the response is JSON, so parse it as JSON
        entity_dict = {}

        for entity in entities:
            # skip placeholder names: the model occasionally echoes an entity
            # type or an error string back as a name
            if entity["name"] in possible_entities or entity["name"].lower().startswith("err"):
                continue
            entity_dict[entity["name"]] = entity["type"]
        if not entity_dict:
            continue
        print(entity_dict)
        response = model.generate_content(
            f"""news: {docs[url]} \n entities: {list(entity_dict.keys())} \n Based on the news article and the list of entities above, return the list of source and target entity pairs that have a clear relationship between them (source name, target name, relationship). Choose entities only from the provided list. Relationships can include sentiment and opinions as well, and should be 1-2 sentences, mentioning the entities and describing the relationship between them.""",
            generation_config=genai.GenerationConfig(
                response_mime_type="application/json", response_schema=list[Relation]
            ),
        )
        relationships = json.loads(response.text)
        print(relationships)
        for relation in relationships:
            source = relation["source"].lower().strip()
            target = relation["target"].lower().strip()
            # add_edge creates missing nodes automatically; append the source
            # URL so the final story can cite it
            if G.has_edge(source, target):
                data = G[source][target]
                data["relationship"] += "\n" + relation["relationship"] + f" (source: {url})"
            else:
                G.add_edge(source, target, relationship=relation["relationship"] + f" (source: {url})")
        time.sleep(5)  # stay under the API rate limit
    except Exception as e:
        print(e)

def shorten_edge_labels(edge_labels, max_characters=12):
    # truncate long relationship strings so plotted edge labels stay readable
    return {k: v[:max_characters] + ".. " for k, v in edge_labels.items()}

G.remove_nodes_from(list(nx.isolates(G)))  # drop entities that never got a relationship
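
# Optional: persist and visualize the graph while iterating on the pipeline.
# This is just a debugging sketch (and where the pickle and matplotlib
# imports come in); the layout and sizes are arbitrary choices.
with open("graph.pkl", "wb") as f:
    pickle.dump(G, f)

pos = nx.spring_layout(G, seed=42)
nx.draw(G, pos, with_labels=True, node_size=300, font_size=8)
edge_labels = shorten_edge_labels(nx.get_edge_attributes(G, "relationship"))
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=6)
plt.show()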

xml = "\n".join(nx.generate_graphml(G))
print(xml)
time.sleep(30)  # pause before the final, large prompt to respect rate limits

response = model.generate_content(
    f"""graph: {xml} \n You are an expert in news storytelling. Based on the knowledge graph above, generate a compelling, professional and captivating story related to {topic} in 500-800 words. Be creative and effectively utilise relationship information between different news articles, but do not make up things. Exclude irrelevant information. Provide source URLs so the user can read more. Return only the story, without a title or additional explanation."""
)
print(response.text)
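
The graph also helps with the "answer queries that involve most of the documents" case: instead of re-sending every article, I can hand the model just the edges that look relevant. A rough sketch, with deliberately naive keyword retrieval (answer_query is my own helper name, not part of any library):

def answer_query(G, query):
    # naive retrieval: keep edges whose endpoints or relationship text
    # mention one of the longer words from the query
    words = [w.lower() for w in query.split() if len(w) > 3]
    relevant = [
        f"{u} -> {v}: {data['relationship']}"
        for u, v, data in G.edges(data=True)
        if any(w in u or w in v or w in data["relationship"].lower() for w in words)
    ]
    response = model.generate_content(
        "facts:\n" + "\n".join(relevant)
        + f"\n\nUsing only the facts above, answer: {query}"
    )
    return response.text

print(answer_query(G, "How are countries responding to climate change?"))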
