This one goes out to all my fellow students who are always anxious about an upcoming exam.
I see you guys, asking questions like "What will the professor put on the exam?!" My answer is simple: who cares?
With this script, Ollama will generate multiple-choice questions, line by line, for you to prepare with!
Now, a few words of caution for those who think this is an easy way to get an A--it is not.
This is for those who have done the heavy lifting and want to simply take that extra step to be MORE prepared.
Well, this first script is for you guys.
This will generate multiple-choice questions with four options each from a document. I would recommend breaking your book down into manageable sections by chapter, so you don't have to deal with the overhead of parsing a whole book AND generating questions on all of it!
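If you want to do that splitting in Python, here is a minimal sketch using pypdf (this is separate from the script that follows, and the file name and page ranges are placeholders you would swap for your own chapter boundaries):
from pypdf import PdfReader, PdfWriter
reader = PdfReader("book.pdf")  # placeholder: path to the full book
chapters = {"ch01": (0, 24), "ch02": (24, 51)}  # placeholder 0-indexed page ranges per chapter
for name, (start, end) in chapters.items():
    writer = PdfWriter()
    for page in reader.pages[start:end]:  # copy only this chapter's pages
        writer.add_page(page)
    with open(f"{name}.pdf", "wb") as out_file:
        writer.write(out_file)
With your chapters split out, here is the question generator itself. For your consideration: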
########################
# LOAD LIBRARIES
########################
import spacy
import warnings
import time
import logging
from langchain.llms import Ollama
from langchain.prompts import PromptTemplate
from pdfminer.high_level import extract_text
from tqdm import tqdm
###################################################################################
# SUPPRESS WARNINGS THAT CAN CLUTTER THE OUTPUT, SUCH AS DEPRECATION WARNINGS, ETC.
###################################################################################
warnings.filterwarnings("ignore")
#####################################
# LOAD THE SPACY LANGUAGE MODEL.
#####################################
nlp = spacy.load("en_core_web_sm")
def nest_sentences(document, max_length=4096):
"""
Break down a document into manageable chunks of sentences where each chunk is under a specified length.
Parameters:
- document (str): The input text document to be processed.
- max_length (int): The maximum character length for each chunk.
Returns:
- list: A list where each element is a group of sentences that together are less than max_length characters.
"""
nested = [] # List to hold all chunks of sentences
sent = [] # Temporary list to hold sentences for a current chunk
length = 0 # Counter to keep track of the character length of the current chunk
doc = nlp(document) # Process the document using Spacy to tokenize into sentences
for sentence in doc.sents:
length += len(sentence.text)
if length < max_length:
sent.append(sentence.text)
else:
nested.append(' '.join(sent)) # Join sentences in the chunk and add to the nested list
sent = [sentence.text] # Start a new chunk with the current sentence
length = len(sentence.text) # Reset the length counter to the length of the current sentence
if sent: # Don't forget to add the last chunk if it's not empty
nested.append(' '.join(sent))
return nested
def generate_summary(text, llm, max_length=4096):
"""
Generate a summary for provided text using the specified large language model (LLM).
Parameters:
- text (str): Text to summarize.
- llm (LLMChain): The large language model to use for generating summaries.
- max_length (int): The maximum character length for each summary chunk.
Returns:
- str: A single string that is the concatenated summary of all processed chunks.
"""
sentences = nest_sentences(text, max_length)
summaries = [] # List to hold summaries of each chunk
seen_questions = set() # Set to track unique questions
prompt_template = PromptTemplate(
input_variables=["text"],
template="Generate diverse multiple-choice questions/answer are one sentence long based on the context here: {text}. "
"Ensure each question is unique and not repetitive. "
"Format:\nQuestion: Question?\n- A) Option A.\n- B) Option B.\n- C) Option C.\n- D) Option D.\nAnswer: Answer\n***\n"
)
    for chunk in tqdm(sentences, desc="Generating summaries"):
        # Use the LLM to generate question/answer blocks for this chunk.
        prompt = prompt_template.format(text=chunk)
        result = llm.invoke(prompt)
        result_lines = [line.strip() for line in result.strip().split("\n")]
        current_question = None
        for line in result_lines:
            if line.startswith("Question:"):
                # Start a new block, skipping questions we have already seen.
                current_question = line if line not in seen_questions else None
                if current_question:
                    seen_questions.add(current_question)
                    summaries.append(current_question)
            elif current_question and (line.startswith("-") or line.startswith("Answer:")):
                # Keep the options and the answer that belong to the current question.
                summaries.append(line)
                if line.startswith("Answer:"):
                    current_question = None
        # Optionally print each generated block.
        print(result.strip())
# Join all summaries into a single string with spaces in between.
return "\n".join(summaries)
def main_loop(delay):
"""
Run the main loop, which generates summaries periodically, for 30 minutes.
Parameters:
- delay (int): The delay in seconds between each iteration of the loop.
"""
end_time = time.time() + 30 * 60 # 30 minutes from now
while time.time() < end_time:
try:
# Extract text from a PDF file.
text = extract_text("/home/roomal/Desktop/PSY-3180/pdfs/Book 1/13 - Reproductive Behavior.pdf")
# Generate and print the summary for the extracted text.
summary = generate_summary(text, llm)
print(summary)
except Exception as e:
logging.error(f"An error occurred: {e}")
# Pause for the specified delay before the next iteration.
time.sleep(delay)
#####################################
# CONFIGURATION FOR THE LANGUAGE MODEL.
#####################################
llm = Ollama(model="llama3:latest", temperature=0.9)
#######################################
# RUN THE MAIN LOOP FOR 30 MINUTES
#######################################
if __name__ == '__main__':
logging.basicConfig(level=logging.INFO)
logging.info("Starting the main loop for 30 minutes... Or whatever.")
delay = int(input("Enter the delay time in seconds between each iteration: "))
main_loop(delay)
logging.info("Main loop completed.")
This next script automates the process of scraping Wikipedia for a specified topic, processing the text, identifying topics with BERTopic, and generating questions based on those topics. It begins by listing the packages to install and setting up logging. The scrape_wikipedia function collects content from Wikipedia pages related to a given topic. The text is then cleaned, tokenized, and stripped of stopwords. The sentences are vectorized with CountVectorizer and reduced with UMAP, BERTopic is used to identify key topics, and the topic representations are further fine-tuned with an Ollama model. Finally, questions are generated for each topic using a language model and saved for later use. This workflow is ideal for creating educational content, facilitating research, and enhancing study materials.
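Before diving into the full pipeline, the core of the topic-modeling step boils down to this pattern (a minimal sketch, separate from the script below, assuming docs is a list of sentence strings you have already prepared):
from bertopic import BERTopic
topic_model = BERTopic()                          # default embeddings, UMAP, and clustering
topics, probs = topic_model.fit_transform(docs)   # assigns a topic id to every sentence
print(topic_model.get_topic_info().head())        # one row per topic: id, size, top terms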
# pip install wikipedia-api bertopic umap-learn pandas nltk pdfminer.six tqdm rich langchain
import wikipediaapi
import pandas as pd
import concurrent.futures
from tqdm import tqdm
import json
import nltk
import re
import time
import umap
import logging
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from langchain import PromptTemplate
from langchain.llms import Ollama
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from rich.progress import Progress
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('punkt')
nltk.download('stopwords')
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def scrape_wikipedia(name_topic, verbose=True, max_workers=5):
def link_to_wikipedia(link):
try:
page = api_wikipedia.page(link)
if page.exists():
return {
"page": link,
"text": page.text,
"link": page.fullurl,
"categories": list(page.categories.keys()),
}
except Exception as e:
print(f"Error processing {link}: {e}")
return None
api_wikipedia = wikipediaapi.Wikipedia(
language="en",
user_agent="YourUserAgentHere",
extract_format=wikipediaapi.ExtractFormat.WIKI,
)
name_of_page = api_wikipedia.page(name_topic)
if not name_of_page.exists():
print(f"Page {name_topic} is not present")
return
links_to_page = list(name_of_page.links.keys())
procceed = tqdm(desc="Scraped links", unit="", total=len(links_to_page)) if verbose else None
origin = [{
"page": name_topic,
"text": name_of_page.text,
"link": name_of_page.fullurl,
"categories": list(name_of_page.categories.keys()),
}]
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
links_future = {executor.submit(link_to_wikipedia, link): link for link in links_to_page}
for future in concurrent.futures.as_completed(links_future):
info = future.result()
if info:
origin.append(info)
if verbose:
procceed.update(1)
if verbose:
procceed.close()
# Define namespaces to exclude
namespaces = (
"Wikipedia", "Special", "Talk", "LyricWiki", "File", "MediaWiki",
"Template", "Help", "User", "Category talk", "Portal talk"
)
# Create DataFrame and filter based on text length and namespaces
origin_df = pd.DataFrame(origin)
origin_df = origin_df[
(origin_df["text"].str.len() > 20) &
(~origin_df["page"].str.startswith(namespaces, na=True))
]
# Process categories to remove 'Category:' prefix
origin_df["categories"] = origin_df["categories"].apply(lambda cats: [cat[9:] for cat in cats])
origin_df["topic"] = name_topic
print("Scraped pages:", len(origin_df))
return origin_df
def clean_text(text):
text = text.lower()
text = re.sub(r'\s+', ' ', text) # Remove extra whitespace
text = re.sub(r'[^a-z0-9\s.]', '', text) # Remove non-alphanumeric characters except periods
return text
def tokenize_text(text):
tokens = word_tokenize(text)
return tokens
def remove_stopwords(tokens):
filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]
return filtered_tokens
def preprocess_text(text):
cleaned_text = clean_text(text)
tokens = tokenize_text(cleaned_text)
filtered_tokens = remove_stopwords(tokens)
return ' '.join(filtered_tokens)
def generate_questions(context, llm, prompt_template):
formatted_prompt = prompt_template.format(context=context)
response = llm.invoke(formatted_prompt)
return response.strip()
# Start the process
logging.info("Starting the process...")
# 1. Scrape Wikipedia for the specified topic
topic = input("Enter the Wikipedia topic to scrape: ")
data = scrape_wikipedia(topic)
if data is None or data.empty:
logging.error(f"No data found for topic: {topic}")
exit()
# Save the scraped data to a CSV file
data.to_csv("/home/roomal/Desktop/scraped_data.csv", index=False)
logging.info("Scraped data saved to CSV.")
# Convert the scraped text data into a single text format
text_data = " ".join(data["text"].tolist())
# 2. Preprocess the text
processed_text = preprocess_text(text_data)
logging.info("Text preprocessing complete.")
# 3. Tokenize the text into sentences
sentences = sent_tokenize(processed_text)
logging.info(f"Number of sentences extracted: {len(sentences)}")
# 4. Configure UMAP and vectorizer parameters
umap_model = umap.UMAP(n_neighbors=10, n_components=5, min_dist=0.1, metric='cosine')
vectorizer_model = CountVectorizer(min_df=1, max_df=0.95)
# 5. Fit BERTopic model
logging.info("Fitting BERTopic model...")
topic_model = BERTopic(
umap_model=umap_model,
vectorizer_model=vectorizer_model,
representation_model=KeyBERTInspired()
)
with Progress() as progress:
task = progress.add_task("Fitting BERTopic model...", total=len(sentences))
topics, probs = topic_model.fit_transform(sentences)
progress.update(task, advance=len(sentences))
logging.info("BERTopic model fitted successfully.")
topic_info = topic_model.get_topic_info()
# Save the topic information to a CSV file
topic_info.to_csv("/home/roomal/Desktop/topic_info.csv", index=False)
logging.info("Topic information saved to CSV.")
# 6. Fine-tune topic representations with Ollama
logging.info("Fine-tuning topic representations with Ollama...")
topic_representation_model = Ollama(model="mistral:latest", temperature=0.8)
topic_model = BERTopic(
representation_model=topic_representation_model,
umap_model=umap_model,
vectorizer_model=vectorizer_model
)
with Progress() as progress:
task = progress.add_task("Fitting BERTopic with Ollama...", total=len(sentences))
topic_model.fit(sentences)
progress.update(task, advance=len(sentences))
logging.info("Topic representations fine-tuned successfully.")
# 7. Get representative documents for each topic
docs_by_topic = topic_model.get_representative_docs()
# 8. Generate questions using LangChain
llm_for_questions = Ollama(model="llama3:latest", temperature=0.5)
# Define a more varied prompt template for generating questions
prompt_template = PromptTemplate(
input_variables=['context'],
template='''Generate several insightful and varied questions based on the context below:
Context: {context}
Questions:\n
1. '''
)
# Generate questions for each topic
questions = []
with Progress() as progress:
task = progress.add_task("Generating questions...", total=len(docs_by_topic))
for topic, docs in docs_by_topic.items():
context = " ".join(docs)
question = generate_questions(context, llm_for_questions, prompt_template)
questions.append(question)
progress.update(task, advance=1)
logging.info("Questions generated successfully.")
# Print the generated questions
for i, question in enumerate(questions):
print(f"Topic {i+1}: {question}")
# Save the questions to a text file
with open("/home/roomal/Desktop/generated_questions.txt", "w") as f:
for i, question in enumerate(questions):
f.write(f"Topic {i+1}:\n{question}\n\n")
logging.info("Generated questions saved to text file.")
This next one is a variation of the script above, except it works from a PDF and generates comprehensive, essay-style questions.
This code automates the extraction and processing of text from a specified PDF, identifies topics within the text using BERTopic, and generates insightful questions based on these topics using a language model. It begins by loading necessary libraries and configuring logging for progress tracking. The text is extracted from the PDF and cleaned, tokenized, and filtered to remove stopwords. The processed text is then analyzed using UMAP for dimensionality reduction and BERTopic for topic modeling. Fine-tuning of topic representations is performed using the Ollama model. Representative documents for each topic are identified, and questions are generated using LangChain with a specified prompt template. The process is repeated in a loop for a defined duration, continuously generating and printing questions, which can be useful for educational and research purposes.
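One difference from the Wikipedia version: the script below only prints each batch of questions. If you also want to keep them, a small addition like the following (a sketch that would go after the print loop inside main_loop, reusing the questions list that loop already builds, with a file name of your choosing) mirrors what the previous script does:
with open("generated_questions.txt", "a", encoding="utf-8") as f:
    for i, question in enumerate(questions):
        f.write(f"Topic {i+1}:\n{question}\n\n")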
####################
# LOAD LIBRARIES
####################
import json
import nltk
import re
import time
import umap
import logging
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from langchain import PromptTemplate
from langchain.llms import Ollama
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from pdfminer.high_level import extract_text
from rich.progress import Progress
from sklearn.feature_extraction.text import CountVectorizer
####################
# CONFIGURE LOGGING
####################
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
################################################
# ENSURE NECESSARY NLTK RESOURCES ARE DOWNLOADED
################################################
nltk.download('punkt')
nltk.download('stopwords')
######################
# PATH TO THE PDF FILE
######################
file_path = "C:\\Users\\sefer\\OneDrive\\Desktop\\PSY-3180\\pdfs\\Book 1\\13 - Reproductive Behavior.pdf"
############################
# EXTRACT TEXT FROM THE PDF
############################
logging.info("Extracting text from PDF...")
text = extract_text(file_path)
def clean_text(text):
"""Clean the input text using various preprocessing steps."""
text = text.lower()
text = re.sub(r'\s+', ' ', text) # Remove extra whitespace
text = re.sub(r'[^a-z0-9\s.]', '', text) # Remove non-alphanumeric characters except periods
return text
def tokenize_text(text):
"""Tokenize text into individual words."""
tokens = word_tokenize(text)
return tokens
def remove_stopwords(tokens):
"""Remove stopwords from the list of tokenized words."""
filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]
return filtered_tokens
def preprocess_text(text):
"""Full preprocessing pipeline integrating all the steps."""
cleaned_text = clean_text(text)
tokens = tokenize_text(cleaned_text)
filtered_tokens = remove_stopwords(tokens)
return ' '.join(filtered_tokens)
def generate_questions(context, llm, prompt_template):
formatted_prompt = prompt_template.format(context=context)
response = llm.invoke(formatted_prompt)
return response.strip()
def main_loop(duration_minutes):
end_time = time.time() + duration_minutes * 60
while time.time() < end_time:
logging.info("Starting new iteration of text processing and question generation...")
# Preprocess the text
processed_text = preprocess_text(text)
logging.info("Text preprocessing complete.")
# Tokenize the text into sentences
sentences = sent_tokenize(processed_text)
logging.info(f"Number of sentences extracted: {len(sentences)}")
# Configure UMAP parameters
umap_model = umap.UMAP(n_neighbors=10, n_components=5, min_dist=0.1, metric='cosine')
vectorizer_model = CountVectorizer(min_df=1, max_df=0.95)
# BERTopic model
logging.info("Fitting BERTopic model...")
topic_model = BERTopic(
umap_model=umap_model,
vectorizer_model=vectorizer_model,
representation_model=KeyBERTInspired()
)
with Progress() as progress:
task = progress.add_task("Fitting BERTopic model...", total=len(sentences))
topics, probs = topic_model.fit_transform(sentences)
progress.update(task, advance=len(sentences))
logging.info("BERTopic model fitted successfully.")
topic_model.get_topic_info()
# Fine-tune topic representations with Ollama
logging.info("Fine-tuning topic representations with Ollama...")
topic_representation_model = Ollama(model="llama3:latest", temperature=0.8)
topic_model = BERTopic(
representation_model=topic_representation_model,
umap_model=umap_model,
vectorizer_model=vectorizer_model
)
with Progress() as progress:
task = progress.add_task("Fitting BERTopic with Ollama...", total=len(sentences))
topic_model.fit(sentences)
progress.update(task, advance=len(sentences))
logging.info("Topic representations fine-tuned successfully.")
# Get representative documents for each topic
docs_by_topic = topic_model.get_representative_docs()
# Generate questions using LangChain
llm_for_questions = Ollama(model="llama3:latest", temperature=0.5)
# Define a more varied prompt template for generating questions
prompt_template = PromptTemplate(
input_variables=['context'],
template='''Generate several insightful and varied questions based on the context below:
Context: {context}
Questions:\n
1. '''
)
# Generate questions for each topic
questions = []
with Progress() as progress:
task = progress.add_task("Generating questions...", total=len(docs_by_topic))
for topic, docs in docs_by_topic.items():
context = " ".join(docs)
question = generate_questions(context, llm_for_questions, prompt_template)
questions.append(question)
progress.update(task, advance=1)
logging.info("Questions generated successfully.")
# Print the generated questions
for i, question in enumerate(questions):
print(f"Topic {i+1}: {question}")
# Pause for a moment before the next iteration
time.sleep(1)
#######################################
# RUN THE MAIN LOOP FOR 30 MINUTES
#######################################
if __name__ == '__main__':
logging.info("Starting the main loop for 30 minutes...Or whatever.")
main_loop(30)
logging.info("Main loop completed.")
I have many more scripts to help my fellow students, so just let me know what you wanna see, what ideas are rattling around in that brilliant mind of yours, and I will try to make it happen.
For my next post, I'm going to show some hacking scripts I have developed to make things easier during an engagement.
Best,
Roomal