Here is a simple script to accomplish this task:
In your virtual environment, run:
pip install PyPDF2
pip install spacy
python -m spacy download fr_core_news_sm
import PyPDF2
import spacy
from spacy.lang.fr.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest
import re
def extract_text_from_pdf(file_path):
    """Extract and concatenate the text of every page in a PDF file.

    Args:
        file_path: Path to the PDF file to read.

    Returns:
        A single string with the text of all pages, in document order.
        Pages with no extractable text (e.g. scanned images) contribute
        nothing instead of crashing.
    """
    # Context manager guarantees the handle is closed even if PyPDF2
    # raises while parsing; the original leaked the file on error.
    with open(file_path, 'rb') as pdf_file_obj:
        pdf_reader = PyPDF2.PdfReader(pdf_file_obj)
        # extract_text() may return None for image-only pages; coerce to "".
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
def summarize(text, ratio=0.0013):
    """Extractively summarize French text with SpaCy sentence scoring.

    Each sentence is scored by the sum of the normalized frequencies of
    its non-stopword, non-punctuation tokens; the top-scoring sentences
    are returned joined by spaces.

    Args:
        text: The French text to summarize.
        ratio: Fraction of the document's sentences to keep (at least
            one sentence is always kept when any sentence is scoreable).

    Returns:
        The summary string, or "" if the text has no scoreable tokens.
    """
    nlp = spacy.load('fr_core_news_sm')
    doc = nlp(text)

    # Count lowercased tokens so the lowercase lookups below actually hit.
    # (The original counted original-case tokens but looked them up with
    # .lower(), silently dropping every capitalized word from scoring.)
    word_frequencies = Counter(
        token.text.lower()
        for token in doc
        if not token.is_stop and not token.is_punct
    )
    if not word_frequencies:
        # Avoids max() on an empty sequence for empty/stopword-only input.
        return ""

    # Normalize frequencies to [0, 1] relative to the most common word.
    max_frequency = max(word_frequencies.values())
    for word in word_frequencies:
        word_frequencies[word] /= max_frequency

    # Score each sentence as the sum of its words' normalized frequencies.
    sentence_scores = {}
    for sent in doc.sents:
        for token in sent:
            freq = word_frequencies.get(token.text.lower())
            if freq is not None:
                sentence_scores[sent] = sentence_scores.get(sent, 0.0) + freq

    # Keep at least one sentence: with the default ratio, int() truncation
    # made select_length 0 for any document under ~770 sentences, so the
    # original returned an empty summary almost always.
    select_length = max(1, int(len(sentence_scores) * ratio))
    summary_sentences = nlargest(select_length, sentence_scores,
                                 key=sentence_scores.get)
    return ' '.join(str(sentence) for sentence in summary_sentences)
def main():
    """Extract text from the hard-coded PDF and print its summary."""
    # Path to your PDF file
    file_path = 'sy.pdf'
    # Extract text from PDF, then summarize it.
    pdf_text = extract_text_from_pdf(file_path)
    summary = summarize(pdf_text)
    print("Summary:")
    print(summary)


# Guard so importing this module doesn't immediately read the PDF and
# load the SpaCy model as a side effect.
if __name__ == "__main__":
    main()
Top comments (0)