Roomal Seferaj

Ace Your Exams: Automated Question Generation for the Diligent Student

This one goes out to all my students who are always anxious about the upcoming exam.

I see you guys asking questions like "What will the professor put on the exam?!" My answer is simple: who cares?

With this script, Ollama will generate multiple-choice questions, line by line, for you to prepare with!

Now, a few words of caution for those who think this is an easy way to get an A: it is not.

This is for those who have done the heavy lifting and want to simply take that extra step to be MORE prepared.

Well, this first script is for you guys.

This will generate multiple-choice questions with four options each, one batch per document. I would recommend breaking your book down into manageable sections by chapter; that way you don't have to deal with the overhead of parsing a whole book AND generating questions on it! (A small sketch for splitting a book by page ranges follows the script.) For your consideration:

########################
# LOAD LIBRARIES
########################
import spacy
import warnings
import time
import logging

from langchain.llms import Ollama
from langchain.prompts import PromptTemplate
from pdfminer.high_level import extract_text
from tqdm import tqdm

###################################################################################
# SUPPRESS WARNINGS THAT CAN CLUTTER THE OUTPUT, SUCH AS DEPRECATION WARNINGS, ETC.
###################################################################################
warnings.filterwarnings("ignore")

#####################################
# LOAD THE SPACY LANGUAGE MODEL.
#####################################
nlp = spacy.load("en_core_web_sm")

def nest_sentences(document, max_length=4096):
    """
    Break down a document into manageable chunks of sentences where each chunk is under a specified length.

    Parameters:
    - document (str): The input text document to be processed.
    - max_length (int): The maximum character length for each chunk.

    Returns:
    - list: A list where each element is a group of sentences that together are less than max_length characters.
    """
    nested = []  # List to hold all chunks of sentences
    sent = []    # Temporary list to hold sentences for a current chunk
    length = 0   # Counter to keep track of the character length of the current chunk
    doc = nlp(document)  # Process the document using Spacy to tokenize into sentences

    for sentence in doc.sents:
        length += len(sentence.text)
        if length < max_length:
            sent.append(sentence.text)
        else:
            nested.append(' '.join(sent))  # Join sentences in the chunk and add to the nested list
            sent = [sentence.text]  # Start a new chunk with the current sentence
            length = len(sentence.text)  # Reset the length counter to the length of the current sentence

    if sent:  # Don't forget to add the last chunk if it's not empty
        nested.append(' '.join(sent))

    return nested

def generate_summary(text, llm, max_length=4096):
    """
    Generate a summary for provided text using the specified large language model (LLM).

    Parameters:
    - text (str): Text to summarize.
    - llm (LLMChain): The large language model to use for generating summaries.
    - max_length (int): The maximum character length for each summary chunk.

    Returns:
    - str: A single string that is the concatenated summary of all processed chunks.
    """
    sentences = nest_sentences(text, max_length)
    summaries = []  # List to hold the generated question/answer lines
    seen_questions = set()  # Set to track unique questions

    prompt_template = PromptTemplate(
        input_variables=["text"],
        template="Generate diverse multiple-choice questions/answer are one sentence long based on the context here: {text}. "
                 "Ensure each question is unique and not repetitive. "
                 "Format:\nQuestion: Question?\n- A) Option A.\n- B) Option B.\n- C) Option C.\n- D) Option D.\nAnswer: Answer\n***\n"
    )

    for chunk in tqdm(sentences, desc="Generating questions"):
        # Use the LLM to generate the summary based on the prompt.
        prompt = prompt_template.format(text=chunk)
        result = llm.invoke(prompt)
        result_lines = result.strip().split("\n")

        for idx, line in enumerate(result_lines):
            if line.startswith("Question:"):
                question = line.strip()
                if question not in seen_questions:
                    summaries.append(question)
                    seen_questions.add(question)
                    # Pair this question with the first "Answer:" line that follows it,
                    # rather than always grabbing the first answer in the output.
                    answer = next((l.strip() for l in result_lines[idx + 1:] if l.startswith("Answer:")), "")
                    summaries.append(answer)

        # Optionally print each batch of generated questions.
        print(result.strip())

    # Join all question/answer lines into a single string, one item per line.
    return "\n".join(summaries)

def main_loop(delay):
    """
    Run the main loop, which generates question batches periodically, for 30 minutes.

    Parameters:
    - delay (int): The delay in seconds between each iteration of the loop.
    """
    end_time = time.time() + 30 * 60  # 30 minutes from now
    while time.time() < end_time:
        try:
            # Extract text from a PDF file.
            text = extract_text("/home/roomal/Desktop/PSY-3180/pdfs/Book 1/13 - Reproductive Behavior.pdf")

            # Generate and print the questions for the extracted text.
            questions = generate_summary(text, llm)
            print(questions)
        except Exception as e:
            logging.error(f"An error occurred: {e}")

        # Pause for the specified delay before the next iteration.
        time.sleep(delay)

#####################################
# CONFIGURATION FOR THE LANGUAGE MODEL.
#####################################
llm = Ollama(model="llama3:latest", temperature=0.9)

#######################################
# RUN THE MAIN LOOP FOR 30 MINUTES
#######################################
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    logging.info("Starting the main loop for 30 minutes... Or whatever.")
    delay = int(input("Enter the delay time in seconds between each iteration: "))
    main_loop(delay)
    logging.info("Main loop completed.")
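As promised above, here's a minimal sketch for splitting a book into chapter-sized text files before feeding them to the script. Everything in it is a placeholder: the file name and page ranges would come from your book's table of contents. pdfminer's extract_text accepts a page_numbers parameter with zero-indexed pages.

# A minimal sketch, not part of the script above: split a book PDF into
# chapter-sized text files. The file name and page ranges are hypothetical.
from pdfminer.high_level import extract_text

chapters = {
    "01-intro": range(0, 24),                    # pages are zero-indexed
    "13-reproductive-behavior": range(310, 342),
}

for name, pages in chapters.items():
    chapter_text = extract_text("book.pdf", page_numbers=pages)
    with open(f"{name}.txt", "w", encoding="utf-8") as f:
        f.write(chapter_text)

In the main script you would then read each chapter file with open(...).read() instead of calling extract_text on the whole book.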

This code automates the process of scraping Wikipedia for a specified topic, processing the text, identifying topics using BERTopic, and generating questions based on these topics. It begins by installing necessary packages and setting up logging. The scrape_wikipedia function collects content from Wikipedia pages related to a given topic. The text is then cleaned, tokenized, and stopwords are removed. Using UMAP and CountVectorizer, the text is vectorized, and BERTopic is employed to identify key topics, with further fine-tuning using the Ollama model. Finally, questions are generated for each topic using a language model and saved for future use. This workflow is ideal for creating educational content, facilitating research, and enhancing study materials.

# pip install wikipedia-api bertopic umap-learn pandas nltk pdfminer.six tqdm rich langchain

import wikipediaapi
import pandas as pd
import concurrent.futures
from tqdm import tqdm
import json
import nltk
import re
import time
import umap
import logging
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.representation import LangChain as LangChainRepresentation
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import Ollama
from langchain.prompts import PromptTemplate
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from rich.progress import Progress
from sklearn.feature_extraction.text import CountVectorizer

nltk.download('punkt')
nltk.download('stopwords')

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def scrape_wikipedia(name_topic, verbose=True, max_workers=5):
    def link_to_wikipedia(link):
        try:
            page = api_wikipedia.page(link)
            if page.exists():
                return {
                    "page": link,
                    "text": page.text,
                    "link": page.fullurl,
                    "categories": list(page.categories.keys()),
                }
        except Exception as e:
            print(f"Error processing {link}: {e}")
            return None

    api_wikipedia = wikipediaapi.Wikipedia(
        language="en",
        user_agent="YourUserAgentHere",
        extract_format=wikipediaapi.ExtractFormat.WIKI,
    )

    name_of_page = api_wikipedia.page(name_topic)
    if not name_of_page.exists():
        print(f"Page {name_topic} is not present")
        return

    links_to_page = list(name_of_page.links.keys())
    progress_bar = tqdm(desc="Scraped links", unit="", total=len(links_to_page)) if verbose else None

    origin = [{
        "page": name_topic,
        "text": name_of_page.text,
        "link": name_of_page.fullurl,
        "categories": list(name_of_page.categories.keys()),
    }]

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        links_future = {executor.submit(link_to_wikipedia, link): link for link in links_to_page}
        for future in concurrent.futures.as_completed(links_future):
            info = future.result()
            if info:
                origin.append(info)
            if verbose:
                progress_bar.update(1)
    if verbose:
        progress_bar.close()

    # Define namespaces to exclude
    namespaces = (
        "Wikipedia", "Special", "Talk", "LyricWiki", "File", "MediaWiki",
        "Template", "Help", "User", "Category talk", "Portal talk"
    )

    # Create DataFrame and filter based on text length and namespaces
    origin_df = pd.DataFrame(origin)
    origin_df = origin_df[
        (origin_df["text"].str.len() > 20) & 
        (~origin_df["page"].str.startswith(namespaces, na=True))
    ]

    # Process categories to remove 'Category:' prefix
    origin_df["categories"] = origin_df["categories"].apply(lambda cats: [cat[9:] for cat in cats])

    origin_df["topic"] = name_topic
    print("Scraped pages:", len(origin_df))

    return origin_df

def clean_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    text = re.sub(r'[^a-z0-9\s.]', '', text)  # Remove non-alphanumeric characters except periods
    return text

def tokenize_text(text):
    tokens = word_tokenize(text)
    return tokens

def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))  # Build the set once; per-token lookups in a list are slow
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return filtered_tokens

def preprocess_text(text):
    cleaned_text = clean_text(text)
    tokens = tokenize_text(cleaned_text)
    filtered_tokens = remove_stopwords(tokens)
    return ' '.join(filtered_tokens)

def generate_questions(context, llm, prompt_template):
    formatted_prompt = prompt_template.format(context=context)
    response = llm.invoke(formatted_prompt)
    return response.strip()

# Start the process
logging.info("Starting the process...")

# 1. Scrape Wikipedia for the specified topic
topic = input("Enter the Wikipedia topic to scrape: ")
data = scrape_wikipedia(topic)
if data is None or data.empty:
    logging.error(f"No data found for topic: {topic}")
    exit()

# Save the scraped data to a CSV file
data.to_csv("/home/roomal/Desktop/scraped_data.csv", index=False)
logging.info("Scraped data saved to CSV.")

# Convert the scraped text data into a single text format
text_data = " ".join(data["text"].tolist())

# 2. Preprocess the text
processed_text = preprocess_text(text_data)
logging.info("Text preprocessing complete.")

# 3. Tokenize the text into sentences
sentences = sent_tokenize(processed_text)
logging.info(f"Number of sentences extracted: {len(sentences)}")

# 4. Configure UMAP and vectorizer parameters
umap_model = umap.UMAP(n_neighbors=10, n_components=5, min_dist=0.1, metric='cosine')
vectorizer_model = CountVectorizer(min_df=1, max_df=0.95)

# 5. Fit BERTopic model
logging.info("Fitting BERTopic model...")
topic_model = BERTopic(
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
    representation_model=KeyBERTInspired()
)
with Progress() as progress:
    task = progress.add_task("Fitting BERTopic model...", total=len(sentences))
    topics, probs = topic_model.fit_transform(sentences)
    progress.update(task, advance=len(sentences))

logging.info("BERTopic model fitted successfully.")
topic_info = topic_model.get_topic_info()

# Save the topic information to a CSV file
topic_info.to_csv("/home/roomal/Desktop/topic_info.csv", index=False)
logging.info("Topic information saved to CSV.")

# 6. Fine-tune topic representations with Ollama
logging.info("Fine-tuning topic representations with Ollama...")
# BERTopic expects a representation model rather than a raw LLM, so wrap the
# Ollama LLM in a LangChain QA chain and BERTopic's LangChain adapter.
chain = load_qa_chain(Ollama(model="mistral:latest", temperature=0.8), chain_type="stuff")
topic_representation_model = LangChainRepresentation(chain)
topic_model = BERTopic(
    representation_model=topic_representation_model,
    umap_model=umap_model,
    vectorizer_model=vectorizer_model
)
with Progress() as progress:
    task = progress.add_task("Fitting BERTopic with Ollama...", total=len(sentences))
    topic_model.fit(sentences)
    progress.update(task, advance=len(sentences))

logging.info("Topic representations fine-tuned successfully.")

# 7. Get representative documents for each topic
docs_by_topic = topic_model.get_representative_docs()

# 8. Generate questions using LangChain
llm_for_questions = Ollama(model="llama3:latest", temperature=0.5)

# Define a more varied prompt template for generating questions
prompt_template = PromptTemplate(
    input_variables=['context'],
    template='''Generate several insightful and varied questions based on the context below:

Context: {context}

Questions:\n
1. '''
)

# Generate questions for each topic
questions = []
with Progress() as progress:
    task = progress.add_task("Generating questions...", total=len(docs_by_topic))
    for topic_id, docs in docs_by_topic.items():  # topic_id avoids shadowing the user's topic string
        context = " ".join(docs)
        question = generate_questions(context, llm_for_questions, prompt_template)
        questions.append(question)
        progress.update(task, advance=1)

logging.info("Questions generated successfully.")

# Print the generated questions
for i, question in enumerate(questions):
    print(f"Topic {i+1}: {question}")

# Save the questions to a text file
with open("/home/roomal/Desktop/generated_questions.txt", "w") as f:
    for i, question in enumerate(questions):
        f.write(f"Topic {i+1}:\n{question}\n\n")

logging.info("Generated questions saved to text file.")
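One note on reuse: both fits above start from scratch on every run. If you want to keep a fitted model around for future use, BERTopic can serialize it. A minimal sketch, with a placeholder path:

# A minimal sketch, not part of the script above: persist the fitted BERTopic
# model so a later session can reload it instead of refitting. The path is a placeholder.
topic_model.save("bertopic_model")

# In a later session:
from bertopic import BERTopic
topic_model = BERTopic.load("bertopic_model")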

This next one is a variation of the script above, except it generates comprehensive, essay-style questions.

This code automates the extraction and processing of text from a specified PDF, identifies topics within the text using BERTopic, and generates insightful questions based on these topics using a language model. It begins by loading necessary libraries and configuring logging for progress tracking. The text is extracted from the PDF and cleaned, tokenized, and filtered to remove stopwords. The processed text is then analyzed using UMAP for dimensionality reduction and BERTopic for topic modeling. Fine-tuning of topic representations is performed using the Ollama model. Representative documents for each topic are identified, and questions are generated using LangChain with a specified prompt template. The process is repeated in a loop for a defined duration, continuously generating and printing questions, which can be useful for educational and research purposes.

####################
# LOAD LIBRARIES
####################
import json
import nltk
import re
import time
import umap
import logging

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.representation import LangChain as LangChainRepresentation
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import Ollama
from langchain.prompts import PromptTemplate
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from pdfminer.high_level import extract_text
from rich.progress import Progress
from sklearn.feature_extraction.text import CountVectorizer

####################
# CONFIGURE LOGGING
####################
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

################################################
# ENSURE NECESSARY NLTK RESOURCES ARE DOWNLOADED
################################################
nltk.download('punkt')
nltk.download('stopwords')

######################
# PATH TO THE PDF FILE
######################
file_path = "C:\\Users\\sefer\\OneDrive\\Desktop\\PSY-3180\\pdfs\\Book 1\\13 - Reproductive Behavior.pdf"

############################
# EXTRACT TEXT FROM THE PDF
############################
logging.info("Extracting text from PDF...")
text = extract_text(file_path)

def clean_text(text):
    """Clean the input text using various preprocessing steps."""
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    text = re.sub(r'[^a-z0-9\s.]', '', text)  # Remove non-alphanumeric characters except periods
    return text

def tokenize_text(text):
    """Tokenize text into individual words."""
    tokens = word_tokenize(text)
    return tokens

def remove_stopwords(tokens):
    """Remove stopwords from the list of tokenized words."""
    stop_words = set(stopwords.words('english'))  # Build the set once; per-token lookups in a list are slow
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return filtered_tokens

def preprocess_text(text):
    """Full preprocessing pipeline integrating all the steps."""
    cleaned_text = clean_text(text)
    tokens = tokenize_text(cleaned_text)
    filtered_tokens = remove_stopwords(tokens)
    return ' '.join(filtered_tokens)

def generate_questions(context, llm, prompt_template):
    formatted_prompt = prompt_template.format(context=context)
    response = llm.invoke(formatted_prompt)
    return response.strip()

def main_loop(duration_minutes):
    end_time = time.time() + duration_minutes * 60

    while time.time() < end_time:
        logging.info("Starting new iteration of text processing and question generation...")
        # Preprocess the text
        processed_text = preprocess_text(text)
        logging.info("Text preprocessing complete.")

        # Tokenize the text into sentences
        sentences = sent_tokenize(processed_text)
        logging.info(f"Number of sentences extracted: {len(sentences)}")

        # Configure UMAP parameters
        umap_model = umap.UMAP(n_neighbors=10, n_components=5, min_dist=0.1, metric='cosine')
        vectorizer_model = CountVectorizer(min_df=1, max_df=0.95)

        # BERTopic model
        logging.info("Fitting BERTopic model...")
        topic_model = BERTopic(
            umap_model=umap_model,
            vectorizer_model=vectorizer_model,
            representation_model=KeyBERTInspired()
        )
        with Progress() as progress:
            task = progress.add_task("Fitting BERTopic model...", total=len(sentences))
            topics, probs = topic_model.fit_transform(sentences)
            progress.update(task, advance=len(sentences))

        logging.info("BERTopic model fitted successfully.")
        topic_model.get_topic_info()

        # Fine-tune topic representations with Ollama
        logging.info("Fine-tuning topic representations with Ollama...")
        # As above, wrap the raw LLM in a LangChain QA chain and BERTopic's
        # LangChain representation adapter.
        chain = load_qa_chain(Ollama(model="llama3:latest", temperature=0.8), chain_type="stuff")
        topic_representation_model = LangChainRepresentation(chain)
        topic_model = BERTopic(
            representation_model=topic_representation_model,
            umap_model=umap_model,
            vectorizer_model=vectorizer_model
        )
        with Progress() as progress:
            task = progress.add_task("Fitting BERTopic with Ollama...", total=len(sentences))
            topic_model.fit(sentences)
            progress.update(task, advance=len(sentences))

        logging.info("Topic representations fine-tuned successfully.")
        # Get representative documents for each topic
        docs_by_topic = topic_model.get_representative_docs()

        # Generate questions using LangChain
        llm_for_questions = Ollama(model="llama3:latest", temperature=0.5)

        # Define a more varied prompt template for generating questions
        prompt_template = PromptTemplate(
            input_variables=['context'],
            template='''Generate several insightful and varied questions based on the context below:

Context: {context}

Questions:\n
1. '''
        )

        # Generate questions for each topic
        questions = []
        with Progress() as progress:
            task = progress.add_task("Generating questions...", total=len(docs_by_topic))
            for topic, docs in docs_by_topic.items():
                context = " ".join(docs)
                question = generate_questions(context, llm_for_questions, prompt_template)
                questions.append(question)
                progress.update(task, advance=1)

        logging.info("Questions generated successfully.")
        # Print the generated questions
        for i, question in enumerate(questions):
            print(f"Topic {i+1}: {question}")

        # Pause for a moment before the next iteration
        time.sleep(1)

#######################################
# RUN THE MAIN LOOP FOR 30 MINUTES
#######################################
if __name__ == '__main__':
    logging.info("Starting the main loop for 30 minutes...Or whatever.")
    main_loop(30)
    logging.info("Main loop completed.")
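Since this version only prints the questions to the console, you might also want to append each batch to a file so nothing is lost between iterations. Here's a minimal sketch of a helper you could call after the generation step inside main_loop; the path is a placeholder:

# A minimal sketch, not part of the script above: append each batch of
# generated questions to a file with a timestamp. The path is a placeholder.
from datetime import datetime

def save_questions(questions, path="essay_questions.txt"):
    with open(path, "a", encoding="utf-8") as f:
        f.write(f"=== Generated {datetime.now():%Y-%m-%d %H:%M:%S} ===\n")
        for i, question in enumerate(questions, start=1):
            f.write(f"Topic {i}:\n{question}\n\n")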

I have many more scripts to help my fellow students. Just let me know what you wanna see, what ideas are rattling around in that brilliant mind of yours, and I will try to make it happen.

For my next post, I'm going to show some hacking scripts I have developed to make things easier during an engagement.

Best,

Roomal
