Running benchmarks on someone else's hardware tells you very little. This guide shows you how to measure NexusQuant's impact on your model, your data, and your hardware in under 15 minutes.
Prerequisites
pip install nexusquant-kv transformers torch datasets
You need a HuggingFace causal LM (any model using split-half RoPE — that's every Llama, Mistral, Qwen, and Phi variant since 2023).
Step 1: Load your model
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Model under test; any HF causal LM works — see prerequisites above.
MODEL_NAME = "mistralai/Mistral-7B-v0.1" # replace with yours
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# fp16 halves memory vs fp32; device_map="auto" places layers on the
# available GPU(s) automatically (falls back to CPU if none).
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
)
# Inference only: disables dropout and other train-time behavior.
model.eval()
If you are on a smaller GPU, use load_in_8bit=True or try a quantized checkpoint. The benchmark logic is the same.
Step 2: Compute baseline perplexity
Perplexity (PPL) is the standard quality metric for language models. Lower is better. We measure it on a fixed text corpus so results are reproducible.
import math
from datasets import load_dataset
def compute_ppl(model, tokenizer, text: str, stride: int = 512) -> float:
"""Sliding-window perplexity — handles texts longer than the context window."""
encodings = tokenizer(text, return_tensors="pt")
input_ids = encodings.input_ids.to(model.device)
seq_len = input_ids.size(1)
max_len = model.config.max_position_embeddings
nlls = []
prev_end = 0
for begin in range(0, seq_len, stride):
end = min(begin + max_len, seq_len)
target_len = end - prev_end
input_chunk = input_ids[:, begin:end]
target_chunk = input_chunk.clone()
target_chunk[:, :-target_len] = -100 # mask context
with torch.no_grad():
loss = model(input_chunk, labels=target_chunk).loss
nlls.append(loss.item() * target_len)
prev_end = end
if end == seq_len:
break
return math.exp(sum(nlls) / seq_len)
# Use the first 10 000 tokens of wikitext-2 as a fixed benchmark corpus
# so runs are reproducible across machines and sessions.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
corpus = " ".join(dataset["text"][:200]) # ~10K tokens
# Reference perplexity with no KV-cache eviction applied.
baseline_ppl = compute_ppl(model, tokenizer, corpus)
print(f"Baseline PPL: {baseline_ppl:.4f}")
Step 3: Apply NexusQuant at each preset and re-measure
from nexusquant import nexusquant_evict

# Nominal KV-cache compression ratio advertised for each quality preset.
PRESET_COMPRESSION = {"high": "10x", "balanced": "17x", "max": "33x"}

for preset in ["high", "balanced", "max"]:
    # NexusQuant hooks into the model's attention layers for the duration of the `with` block
    with nexusquant_evict(model, quality=preset):
        nq_ppl = compute_ppl(model, tokenizer, corpus)
    compression = PRESET_COMPRESSION[preset]
    # Relative quality degradation vs. the uncompressed baseline, in percent.
    delta = ((nq_ppl - baseline_ppl) / baseline_ppl) * 100
    print(f"[{preset:8s}] compression={compression} PPL={nq_ppl:.4f} delta={delta:+.2f}%")
Expected output on Mistral-7B:
[high ] compression=10x PPL=... delta=+0.4%
[balanced] compression=17x PPL=... delta=+1.3%
[max ] compression=33x PPL=... delta=+2.6%
Step 4: Test on your own text
Replace corpus with any string you care about:
# Benchmark a domain-specific corpus: any plain-text file you care about.
with open("my_domain_corpus.txt") as f:
    my_text = f.read()

for preset in ["high", "balanced", "max"]:
    with nexusquant_evict(model, quality=preset):
        ppl = compute_ppl(model, tokenizer, my_text)
    print(f"{preset}: {ppl:.4f}")
Domain-specific text (legal, medical, code) may show different degradation curves than wikitext. Always benchmark on text representative of your actual use case.
Full script (copy-paste ready)
import math
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from nexusquant import nexusquant_evict
# Model under test; fp16 + device_map="auto" for GPU placement.
MODEL_NAME = "mistralai/Mistral-7B-v0.1" # change this
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, torch_dtype=torch.float16, device_map="auto"
)
# Inference only — disable dropout and other train-time behavior.
model.eval()
# Fixed, reproducible benchmark corpus (~10K tokens of wikitext-2 test).
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
corpus = " ".join(dataset["text"][:200])
def compute_ppl(model, tokenizer, text, stride=512):
encodings = tokenizer(text, return_tensors="pt")
input_ids = encodings.input_ids.to(model.device)
seq_len = input_ids.size(1)
max_len = model.config.max_position_embeddings
nlls, prev_end = [], 0
for begin in range(0, seq_len, stride):
end = min(begin + max_len, seq_len)
target_len = end - prev_end
chunk = input_ids[:, begin:end]
labels = chunk.clone()
labels[:, :-target_len] = -100
with torch.no_grad():
nlls.append(model(chunk, labels=labels).loss.item() * target_len)
prev_end = end
if end == seq_len:
break
return math.exp(sum(nlls) / seq_len)
# Reference perplexity with no KV-cache eviction applied.
baseline = compute_ppl(model, tokenizer, corpus)
print(f"Baseline: {baseline:.4f}")

for preset in ["high", "balanced", "max"]:
    # NexusQuant hooks the attention layers only inside the `with` block.
    with nexusquant_evict(model, quality=preset):
        ppl = compute_ppl(model, tokenizer, corpus)
    # Relative degradation vs. baseline, in percent (positive = worse).
    delta = (ppl - baseline) / baseline * 100
    print(f"{preset:8s} PPL={ppl:.4f} delta={delta:+.2f}%")
What to look for
- delta < +1% at the high preset → your model is a good fit, safe for production
- delta > +5% at any preset → your model's attention patterns are unusual; open an issue with your model name
- balanced is usually the sweet spot — 17x compression for ~1% quality cost is an excellent trade for most long-context workloads
If you share your results (model name + numbers), I'll add them to the benchmark table in the README.
pip install nexusquant-kv
GitHub: https://github.com/jagmarques/nexusquant
Best regards, João Marques
Top comments (0)