DEV Community

artydev
artydev

Posted on

1

Extractive summarization of PDF files with Spacy

Here is a simple script to accomplish this task:

In your virtual environment, run:

pip install PyPDF2
pip install spacy
python -m spacy download fr_core_news_sm
Enter fullscreen mode Exit fullscreen mode
import PyPDF2
import spacy
from spacy.lang.fr.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest
import re

def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, with no
        separator inserted between pages.
    """
    # The context manager closes the file even when parsing or text
    # extraction raises, which the explicit close() call could not do.
    with open(file_path, 'rb') as pdf_file_obj:
        pdf_reader = PyPDF2.PdfReader(pdf_file_obj)
        # `or ""` keeps the join safe if a page yields no text at all.
        return "".join(
            page_obj.extract_text() or "" for page_obj in pdf_reader.pages
        )

def summarize(text, ratio=0.0013):
    """Build an extractive summary of the given text using SpaCy.

    Each word is scored by its frequency relative to the most frequent
    word, each sentence is scored by summing the scores of its words, and
    the highest-scoring sentences are joined into the summary.

    Args:
        text: The text to summarize.
        ratio: Fraction of the scored sentences to keep. Whenever it is
            positive, at least one sentence is kept so that short texts do
            not produce an empty summary.

    Returns:
        The selected sentences joined by single spaces, highest score
        first, or an empty string when the text contains no scorable word.
    """
    nlp = spacy.load('fr_core_news_sm')
    doc = nlp(text)

    # Count lower-cased forms so the lookups below (which also lower-case)
    # actually hit; skip whitespace tokens, which are frequent in text
    # extracted from PDFs and would otherwise distort the normalization.
    word_frequencies = Counter(
        token.text.lower()
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    )
    if not word_frequencies:
        # Nothing to score (empty text or only stop words/punctuation);
        # max() below would raise on an empty sequence.
        return ''

    max_frequency = max(word_frequencies.values())
    word_scores = {
        word: count / max_frequency
        for word, count in word_frequencies.items()
    }

    # Only sentences containing at least one scored word get an entry.
    sentence_scores = {}
    for sent in doc.sents:
        matched = [
            word_scores[token.text.lower()]
            for token in sent
            if token.text.lower() in word_scores
        ]
        if matched:
            sentence_scores[sent] = sum(matched)

    select_length = int(len(sentence_scores) * ratio)
    if ratio > 0:
        # int() truncates towards zero, so a small ratio on a short text
        # would otherwise select no sentence at all.
        select_length = max(1, select_length)

    summary_sentences = nlargest(
        select_length, sentence_scores, key=sentence_scores.get
    )
    return ' '.join(str(sentence) for sentence in summary_sentences)

# Location of the PDF document to summarize
file_path = 'sy.pdf'

# Read the raw text of the whole document
pdf_text = extract_text_from_pdf(file_path)

# Summarize with the default ratio and show the result under a heading
summary = summarize(pdf_text)
print("Summary:", summary, sep="\n")

Enter fullscreen mode Exit fullscreen mode

AWS GenAI LIVE image

Real challenges. Real solutions. Real talk.

From technical discussions to philosophical debates, AWS and AWS Partners examine the impact and evolution of gen AI.

Learn more

Top comments (0)

Postmark Image

Speedy emails, satisfied customers

Are delayed transactional emails costing you user satisfaction? Postmark delivers your emails almost instantly, keeping your customers happy and connected.

Sign up

👋 Kindness is contagious

Please leave a ❤️ or a friendly comment on this post if you found it helpful!

Okay