Anar

Evaluate ‘Ember’ Embeddings with Llamaindex and AzureOpenAI

There are more embedding models available today than ever before, and the crucial question is how to evaluate them (or your own model) to decide which one to use. The standard approaches can be slow and often require a GPU to run over large datasets. The main example is MTEB: it takes considerable compute and time, so in practice it is mostly run to get a score for publishing a model on the MTEB leaderboard.

Here we will take another approach with the help of AzureOpenAI (we use it because, in our experience, it is faster and more stable than the OpenAI API itself). We will evaluate the Ember model from LLMRails, which ranks 3rd on the embedding leaderboard. The approach is inspired by LlamaIndex's evaluation guide.

Download a test PDF file, which in our case is the Llama 2 paper.

!mkdir data
!wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "data/llama2.pdf"

Declare your LLM and parse the PDF into nodes.

import re
import requests
import pickle
import pandas as pd
from typing import Tuple, List
from llama_index import ServiceContext
from llama_index.llms import AzureOpenAI
from llama_hub.file.pymu_pdf.base import PyMuPDFReader
from llama_index.node_parser import SimpleNodeParser
from llama_index.schema import BaseNode
from llama_index.prompts import (
    ChatMessage,
    ChatPromptTemplate,
    MessageRole,
    PromptTemplate,
)

# Load the PDF and set up an AzureOpenAI-backed service context
loader = PyMuPDFReader()
documents = loader.load(file_path="./data/llama2.pdf")

llm = AzureOpenAI(
    api_base="your api url",
    api_key="your api key",
    api_version="2023-05-15",
    model="gpt-35-turbo-16k",
    deployment_name="gpt3516k",
    request_timeout=1200,
)
service_context = ServiceContext.from_defaults(llm=llm)

# Split the document into 1024-token chunks and parse them into nodes
node_parser = SimpleNodeParser.from_defaults(chunk_size=1024)
nodes = node_parser.get_nodes_from_documents(documents)
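Optionally, sanity-check the parsing step before spending GPT calls on question generation. This is a minimal sketch; the 200-character preview length is arbitrary.

# Quick sanity check on the parsed nodes
print(f"Parsed {len(nodes)} nodes")
print(nodes[0].get_content()[:200])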

Generate Question & Answer Pairs

QA_PROMPT = PromptTemplate(
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, "
    "answer the query.\n"
    "Query: {query_str}\n"
    "Answer: "
)

def generate_answers_for_questions(
    questions: List[str], context: str, llm
) -> List[str]:
    """Generate answers for questions given context."""
    answers = []
    for question in questions:
        fmt_qa_prompt = QA_PROMPT.format(
            context_str=context, query_str=question
        )
        response_obj = llm.complete(fmt_qa_prompt)
        answers.append(str(response_obj))
    return answers

QUESTION_GEN_USER_TMPL = (
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, "
    "generate the relevant questions. "
)

QUESTION_GEN_SYS_TMPL = """\
You are a Teacher/ Professor. Your task is to setup \
{num_questions_per_chunk} questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. Restrict the questions to the \
context information provided.\
"""

question_gen_template = ChatPromptTemplate(
    message_templates=[
        ChatMessage(role=MessageRole.SYSTEM, content=QUESTION_GEN_SYS_TMPL),
        ChatMessage(role=MessageRole.USER, content=QUESTION_GEN_USER_TMPL),
    ]
)

def generate_qa_pairs(nodes: List[BaseNode], llm) -> List[Tuple[str, str]]:
    """Generate question/answer pairs for a sample of nodes."""
    qa_pairs = []
    # Limit to the first 10 nodes to keep cost and runtime manageable
    sample_nodes = nodes[:10]
    for idx, node in enumerate(sample_nodes):
        print(f"Node {idx}/{len(sample_nodes)}")
        context_str = node.get_content(metadata_mode="all")
        fmt_messages = question_gen_template.format_messages(
            num_questions_per_chunk=10,
            context_str=context_str,
        )
        chat_response = llm.chat(fmt_messages)
        raw_output = chat_response.message.content
        result_list = str(raw_output).strip().split("\n")
        cleaned_questions = [
            re.sub(r"^\d+[\).\s]", "", question).strip()
            for question in result_list
        ]
        answers = generate_answers_for_questions(
            cleaned_questions, context_str, llm
        )
        cur_qa_pairs = list(zip(cleaned_questions, answers))
        qa_pairs.extend(cur_qa_pairs)
    return qa_pairs


qa_pairs = generate_qa_pairs(nodes, llm)
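Because this step makes many GPT calls, it can be worth caching the result so you do not regenerate it on every run. The sketch below simply uses the pickle import from above; the file name is arbitrary.

# Cache the generated QA pairs to disk
with open("qa_pairs.pkl", "wb") as f:
    pickle.dump(qa_pairs, f)

# Later, reload them with:
# with open("qa_pairs.pkl", "rb") as f:
#     qa_pairs = pickle.load(f)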

Run the correctness evaluation. Here we use the Ember model through the LLMRails API instead of pulling and running it locally, because the API is faster and lets the evaluation run on a much cheaper machine with limited CPU resources. First, create a datastore at https://console.llmrails.com/, upload the Llama 2 PDF to that datastore, and note your datastore ID. Do not forget to get your API key.
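Before wiring the API into the evaluation loop, you can verify that the datastore answers queries. This is a minimal sketch using the same search endpoint as the generate_answer function further down; YOUR_DATASTORE_ID and YOUR_API_KEY are placeholders you need to fill in.

# Quick check that the LLMRails datastore returns a summarized answer
datastore_id = "YOUR_DATASTORE_ID"
resp = requests.post(
    f"https://api.llmrails.com/v1/datastores/{datastore_id}/search",
    headers={"X-API-KEY": "YOUR_API_KEY"},
    json={"text": "What is Llama 2?", "summarize": True},
)
print(resp.json()["summarization"])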

CORRECTNESS_SYS_TMPL = """
You are an expert evaluation system for a question answering chatbot.

You are given the following information:
- a user query, 
- a reference answer, and
- a generated answer.

Your job is to judge the relevance and correctness of the generated answer.
Output a single score that represents a holistic evaluation.
You must return your response in a line with only the score.
Do not return answers in any other format.
On a separate line provide your reasoning for the score as well.

Follow these guidelines for scoring:
- Your score has to be between 1 and 5, where 1 is the worst and 5 is the best.
- If the generated answer is not relevant to the user query, \
you should give a score of 1.
- If the generated answer is relevant but contains mistakes, \
you should give a score between 2 and 3.
- If the generated answer is relevant and fully correct, \
you should give a score between 4 and 5.
"""

CORRECTNESS_USER_TMPL = """
## User Query
{query}

## Reference Answer
{reference_answer}

## Generated Answer
{generated_answer}
"""

eval_chat_template = ChatPromptTemplate(
    message_templates=[
        ChatMessage(role=MessageRole.SYSTEM, content=CORRECTNESS_SYS_TMPL),
        ChatMessage(role=MessageRole.USER, content=CORRECTNESS_USER_TMPL),
    ]
)

def run_correctness_eval(
    query_str: str,
    reference_answer: str,
    generated_answer: str,
    llm,
    threshold: float = 4.0,
):
    """Run correctness eval."""
    # Format the judge prompt with the query and both answers
    fmt_messages = eval_chat_template.format_messages(
        query=query_str,
        reference_answer=reference_answer,
        generated_answer=generated_answer,
    )
    chat_response = llm.chat(fmt_messages)
    raw_output = chat_response.message.content

    # Extract from response
    score_str, reasoning_str = raw_output.split("\n", 1)
    score = float(score_str)
    reasoning = reasoning_str.lstrip("\n")

    return {"passing": score >= threshold, "score": score, "reason": reasoning}


def generate_answer(query):
    """Query the LLMRails datastore and return its summarized answer."""
    datastore_id = "YOUR_DATASTORE_ID"
    response = requests.post(
        f"https://api.llmrails.com/v1/datastores/{datastore_id}/search",
        headers={"X-API-KEY": "YOUR_API_KEY"},
        json={"text": query, "summarize": True},
    )
    return response.json()["summarization"]



def run_evals(qa_pairs: List[Tuple[str, str]], llm):
    results_list = []
    for question, reference_answer in qa_pairs:
        response = generate_answer(question)
        generated_answer = str(response)

        correctness_results = run_correctness_eval(
            question,
            reference_answer,
            generated_answer,
            llm=llm,
            threshold=4.0,
        )

        cur_result_dict = {"correctness": correctness_results["passing"]}

        results_list.append(cur_result_dict)

    return pd.DataFrame(results_list)

evals_df = run_evals(qa_pairs, llm)
print(evals_df["correctness"].mean())
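If you want to see the judge's reasoning rather than just the aggregate pass rate, you can run the correctness eval on a single pair. This is a small sketch reusing the functions defined above.

# Inspect one pair in detail: score, pass/fail, and the judge's reasoning
question, reference_answer = qa_pairs[0]
result = run_correctness_eval(
    question,
    reference_answer,
    generate_answer(question),
    llm=llm,
    threshold=4.0,
)
print(result["score"], result["passing"])
print(result["reason"])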

Ta-da: you now have an evaluation method that is faster and cheaper. You can use it with any embedding model you want by simply swapping out the generate_answer function, as sketched below.
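For example, to evaluate a locally hosted embedding model instead of the LLMRails API, you could build a LlamaIndex vector index over the same nodes and answer queries from it. This is only a sketch: the HuggingFaceEmbedding import path and the model name are assumptions and may differ depending on your llama_index version.

# Hypothetical local-embedding variant of generate_answer
from llama_index import VectorStoreIndex
from llama_index.embeddings import HuggingFaceEmbedding  # import path may vary by version

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")  # any embedding model
local_service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
index = VectorStoreIndex(nodes, service_context=local_service_context)
query_engine = index.as_query_engine(similarity_top_k=2)

def generate_answer(query):
    """Answer the query with the local index instead of the LLMRails API."""
    return str(query_engine.query(query))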
