Running benchmarks on someone else's hardware tells you very little. This guide shows you how to measure NexusQuant's impact on your model, your data, and your hardware in under 15 minutes.
Prerequisites
pip install nexusquant-kv transformers torch datasets
You need a HuggingFace causal LM (any model using split-half RoPE — that's every Llama, Mistral, Qwen, and Phi variant since 2023).
Step 1: Load your model
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Model under test; any HF causal LM works — see prerequisites above.
MODEL_NAME = "mistralai/Mistral-7B-v0.1" # replace with yours
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# fp16 halves memory vs fp32; device_map="auto" places layers on the
# available GPU(s) automatically (falls back to CPU if none).
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
)
# Inference only: disables dropout and other train-time behavior.
model.eval()
If you are on a smaller GPU, use load_in_8bit=True or try a quantized checkpoint. The benchmark logic is the same.
Step 2: Compute baseline perplexity
Perplexity (PPL) is the standard quality metric for language models. Lower is better. We measure it on a fixed text corpus so results are reproducible.
import math
from datasets import load_dataset
def compute_ppl(model, tokenizer, text: str, stride: int = 512) -> float:
"""Sliding-window perplexity — handles texts longer than the context window."""
encodings = tokenizer(text, return_tensors="pt")
input_ids = encodings.input_ids.to(model.device)
seq_len = input_ids.size(1)
max_len = model.config.max_position_embeddings
nlls = []
prev_end = 0
for begin in range(0, seq_len, stride):
end = min(begin + max_len, seq_len)
target_len = end - prev_end
input_chunk = input_ids[:, begin:end]
target_chunk = input_chunk.clone()
target_chunk[:, :-target_len] = -100 # mask context
with torch.no_grad():
loss = model(input_chunk, labels=target_chunk).loss
nlls.append(loss.item() * target_len)
prev_end = end
if end == seq_len:
break
return math.exp(sum(nlls) / seq_len)
# Use the first 10 000 tokens of wikitext-2 as a fixed benchmark corpus
# so runs are reproducible across machines and sessions.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
corpus = " ".join(dataset["text"][:200]) # ~10K tokens
# Reference perplexity with no KV-cache eviction applied.
baseline_ppl = compute_ppl(model, tokenizer, corpus)
print(f"Baseline PPL: {baseline_ppl:.4f}")
Step 3: Apply NexusQuant at each preset and re-measure
from nexusquant import nexusquant_evict

# Nominal KV-cache compression ratio advertised for each quality preset.
PRESET_COMPRESSION = {"high": "10x", "balanced": "17x", "max": "33x"}

for preset in ["high", "balanced", "max"]:
    # NexusQuant hooks into the model's attention layers for the duration of the `with` block
    with nexusquant_evict(model, quality=preset):
        nq_ppl = compute_ppl(model, tokenizer, corpus)
    compression = PRESET_COMPRESSION[preset]
    # Relative quality degradation vs. the uncompressed baseline, in percent.
    delta = ((nq_ppl - baseline_ppl) / baseline_ppl) * 100
    print(f"[{preset:8s}] compression={compression} PPL={nq_ppl:.4f} delta={delta:+.2f}%")
Expected output on Mistral-7B:
[high ] compression=10x PPL=... delta=+0.4%
[balanced] compression=17x PPL=... delta=+1.3%
[max ] compression=33x PPL=... delta=+2.6%
Step 4: Test on your own text
Replace corpus with any string you care about:
# Benchmark a domain-specific corpus: any plain-text file you care about.
with open("my_domain_corpus.txt") as f:
    my_text = f.read()

for preset in ["high", "balanced", "max"]:
    with nexusquant_evict(model, quality=preset):
        ppl = compute_ppl(model, tokenizer, my_text)
    print(f"{preset}: {ppl:.4f}")
Domain-specific text (legal, medical, code) may show different degradation curves than wikitext. Always benchmark on text representative of your actual use case.
Full script (copy-paste ready)
import math
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from nexusquant import nexusquant_evict
# Model under test; fp16 + device_map="auto" for GPU placement.
MODEL_NAME = "mistralai/Mistral-7B-v0.1" # change this
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, torch_dtype=torch.float16, device_map="auto"
)
# Inference only — disable dropout and other train-time behavior.
model.eval()
# Fixed, reproducible benchmark corpus (~10K tokens of wikitext-2 test).
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
corpus = " ".join(dataset["text"][:200])
def compute_ppl(model, tokenizer, text, stride=512):
encodings = tokenizer(text, return_tensors="pt")
input_ids = encodings.input_ids.to(model.device)
seq_len = input_ids.size(1)
max_len = model.config.max_position_embeddings
nlls, prev_end = [], 0
for begin in range(0, seq_len, stride):
end = min(begin + max_len, seq_len)
target_len = end - prev_end
chunk = input_ids[:, begin:end]
labels = chunk.clone()
labels[:, :-target_len] = -100
with torch.no_grad():
nlls.append(model(chunk, labels=labels).loss.item() * target_len)
prev_end = end
if end == seq_len:
break
return math.exp(sum(nlls) / seq_len)
# Reference perplexity with no KV-cache eviction applied.
baseline = compute_ppl(model, tokenizer, corpus)
print(f"Baseline: {baseline:.4f}")

for preset in ["high", "balanced", "max"]:
    # NexusQuant hooks the attention layers only inside the `with` block.
    with nexusquant_evict(model, quality=preset):
        ppl = compute_ppl(model, tokenizer, corpus)
    # Relative degradation vs. baseline, in percent (positive = worse).
    delta = (ppl - baseline) / baseline * 100
    print(f"{preset:8s} PPL={ppl:.4f} delta={delta:+.2f}%")
What to look for
- delta < +1% at the high preset → your model is a good fit, safe for production
- delta > +5% at any preset → your model's attention patterns are unusual; open an issue with your model name
- balanced is usually the sweet spot — 17x compression for ~1% quality cost is an excellent trade for most long-context workloads
If you share your results (model name + numbers), I'll add them to the benchmark table in the README.
pip install nexusquant-kv
GitHub: https://github.com/jagmarques/nexusquant
Best regards, João Marques
Top comments (0)