DEV Community

artydev
artydev

Posted on

Extractive summarization in Python with Sumy

# %%
# %pip install pymupdf
# %pip install sumy
# %pip install frontend
# %pip install tools

# %%
import pymupdf  # PyMuPDF
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer

# %%
def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file, passed to ``pymupdf.open``.

    Returns:
        A ``(text, page_count)`` tuple: the text of all pages concatenated
        in page order, and the number of pages in the document.
    """
    # The context manager closes the document even if reading a page fails,
    # so the file handle is not left open.
    with pymupdf.open(pdf_path) as doc:
        # Read the page count while the document is still open.
        page_count = len(doc)
        # Join once instead of growing a string with += for every page.
        text = "".join(page.get_text() for page in doc)
    return (text, page_count)

# %%
def summarize_text(text, num_sentences=10, language="french"):
    """Build an extractive summary of *text* with sumy's LSA summarizer.

    Args:
        text: Plain text to summarize.
        num_sentences: Number of sentences requested from the summarizer.
        language: Language name handed to sumy's ``Tokenizer``. Defaults to
            ``"french"``, the value that was previously hard-coded.

    Returns:
        The result of calling ``LsaSummarizer`` on the parsed document:
        the sentences it selected for the summary.
    """
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = LsaSummarizer()
    return summarizer(parser.document, num_sentences)


# %%
# Path of the PDF to summarize.
pdf_path = "sy.pdf"


# %%
# Extract the text, report the page count, then request a 30-sentence summary.
text, page_count = extract_text_from_pdf(pdf_path)
print(page_count)
summary = summarize_text(text, num_sentences=30)

# %%
# Print the summary object exactly as returned by summarize_text.
print(summary)

# %%
# NOTE(review): the [1:] slice skips the first selected sentence; confirm
# that this is intended.
for sentence in summary[1:]:
    print(sentence)

Enter fullscreen mode Exit fullscreen mode

AWS Q Developer image

Your AI Code Assistant

Automate your code reviews. Catch bugs before your coworkers. Fix security issues in your code. Built to handle large projects, Amazon Q Developer works alongside you from idea to production code.

Get started free in your IDE

Top comments (0)

Billboard image

Create up to 10 Postgres Databases on Neon's free plan.

If you're starting a new project, Neon has got your databases covered. No credit cards. No trials. No getting in your way.

Try Neon for Free →

👋 Kindness is contagious

Please leave a ❤️ or a friendly comment on this post if you found it helpful!

Okay