DEV Community


Posted on


Extractive summarization of PDF files with Spacy

Here is a simple script to accomplish this task:

In your virtual environment, run:

pip install PyPDF2
pip install spacy
python -m spacy download fr_core_news_sm
Enter fullscreen mode Exit fullscreen mode
import re
from collections import Counter
from heapq import nlargest
from string import punctuation

import PyPDF2
import spacy
from spacy.lang.fr.stop_words import STOP_WORDS

def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages, concatenated in page order into one string.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # The context manager guarantees the handle is closed, even when the
    # PDF parser raises part-way through.
    with open(file_path, 'rb') as pdf_file_obj:
        pdf_reader = PyPDF2.PdfReader(pdf_file_obj)
        # Extraction must happen while the file is still open, hence the
        # return inside the ``with`` block. ``or ""`` is defensive: a falsy
        # result (e.g. None) is treated as empty text so the join cannot fail.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)

def summarize(text, ratio=0.0013):
    """Build an extractive summary of ``text`` with spaCy.

    Every word that is not a stop word, punctuation or whitespace is given a
    score equal to its frequency divided by the highest frequency in the
    text. Each sentence is scored by the sum of the scores of its words, and
    the best-scoring sentences form the summary.

    Args:
        text: The text to summarise.
        ratio: Fraction of the scored sentences to keep. When it is positive,
            at least one sentence is kept, so that short documents do not
            produce an empty summary with the small default value.

    Returns:
        The selected sentences joined by single spaces, highest score first.
        An empty string when the text contains no scorable word.
    """
    nlp = spacy.load('fr_core_news_sm')
    doc = nlp(text)

    # Count lower-cased words so that the sentence-scoring lookup below,
    # which is also lower-cased, finds capitalised words too.
    counts = Counter(
        token.text.lower()
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    )
    if not counts:
        # Empty or stop-word-only text: nothing to score, and max() below
        # would raise on an empty sequence.
        return ''

    max_frequency = max(counts.values())
    word_frequencies = {
        word: count / max_frequency for word, count in counts.items()
    }

    # A sentence's score accumulates the score of each of its known words;
    # sentences without any known word are left out entirely.
    sentence_scores = {}
    for sent in doc.sents:
        score = sum(
            word_frequencies.get(token.text.lower(), 0) for token in sent
        )
        if score:
            sentence_scores[sent] = score

    select_length = int(len(sentence_scores) * ratio)
    if ratio > 0:
        # int() truncates towards zero, which would select no sentence at
        # all for most documents with a small ratio.
        select_length = max(1, select_length)

    summary_sentences = nlargest(
        select_length, sentence_scores, key=sentence_scores.get
    )
    return ' '.join(str(sentence) for sentence in summary_sentences)

# Run the example only when this file is executed as a script, so that
# importing the module does not try to read a PDF.
if __name__ == "__main__":
    # Path to your PDF file
    file_path = 'sy.pdf'

    # Extract text from PDF
    pdf_text = extract_text_from_pdf(file_path)

    # Summarise the extracted text and display the result
    summary = summarize(pdf_text)
    print(summary)

Enter fullscreen mode Exit fullscreen mode

Sentry image

See why 4M developers consider Sentry, “not bad.”

Fixing code doesn’t have to be the worst part of your day. Learn how Sentry can help.

Learn more

Top comments (0)

Sentry image

See why 4M developers consider Sentry, “not bad.”

Fixing code doesn’t have to be the worst part of your day. Learn how Sentry can help.

Learn more