Theory Notebook — Math for LLMs

Capability Benchmarks

Evaluation and Reliability / Capability Benchmarks

Run notebook
Theory Notebook

Theory Notebook

Converted from theory.ipynb for web reading.

Capability Benchmarks

Capability benchmarks estimate what a model can do under a stated protocol; reliability begins when the protocol, metric, and uncertainty are explicit.

This notebook is the executable companion to notes.md. It uses synthetic data so the evaluation mathematics can run anywhere without external files.

Code cell 2

# Core numerics and plotting stack used by every demo cell below.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Prefer seaborn's colorblind whitegrid theme when the package is installed;
# otherwise fall back to matplotlib's bundled port of the same style.
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

# Global figure defaults: readable fonts, no top/right spines, tight saves.
# These override anything the theme above set, since they are applied last.
mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
# Seed the legacy global RNG once so every cell's draws are reproducible
# end-to-end; cells consume this shared stream in document order.
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

# NOTE(review): `math` is not referenced anywhere visible in this file —
# kept because removing a file-level import could break unseen cells.
import math

# Hex colors shared by all demo plots. NOTE(review): these hexes match
# Paul Tol's colorblind-safe "bright" palette — confirm that was the intent.
COLORS = {
    "primary":   "#0077BB",
    "secondary": "#EE7733",
    "tertiary":  "#009988",
    "error":     "#CC3311",
    "neutral":   "#555555",
    "highlight": "#EE3377",
}

def header(title):
    """Print *title* framed above and below by an 80-character '=' rule."""
    rule = "=" * 80
    print(f"\n{rule}\n{title}\n{rule}")

def check_true(condition, message):
    """Print PASS/FAIL for *message*, then assert that *condition* is truthy."""
    ok = bool(condition)
    label = "PASS" if ok else "FAIL"
    print(f"{label} - {message}")
    assert ok

def check_close(actual, expected, tol=1e-8, message="values close"):
    """Print PASS/FAIL comparing *actual* to *expected* within absolute
    tolerance *tol*, then assert the closeness."""
    ok = abs(actual - expected) <= tol
    label = "PASS" if ok else "FAIL"
    print(f"{label} - {message}: actual={actual:.6f}, expected={expected:.6f}")
    assert ok

def bootstrap_mean_ci(values, B=1000, alpha=0.05):
    """Percentile-bootstrap confidence interval for the mean of *values*.

    Parameters
    ----------
    values : array-like of float
        Non-empty sample of observations.
    B : int
        Number of bootstrap resamples.
    alpha : float
        Miscoverage level; the interval spans the (alpha/2, 1 - alpha/2)
        quantiles of the resampled means.

    Returns
    -------
    tuple of float
        (sample mean, lower bound, upper bound).

    Raises
    ------
    ValueError
        If *values* is empty — resampling is undefined, and previously this
        surfaced as an opaque "low >= high" error from np.random.randint.
    """
    values = np.asarray(values, dtype=float)
    if values.size == 0:
        raise ValueError("bootstrap_mean_ci requires a non-empty sample")
    # Draw a B x n matrix of indices: each row is one resample with replacement.
    idx = np.random.randint(0, len(values), size=(B, len(values)))
    boot = values[idx].mean(axis=1)
    lo, hi = np.quantile(boot, [alpha / 2, 1 - alpha / 2])
    return float(values.mean()), float(lo), float(hi)

def sigmoid(x):
    """Numerically stable logistic function 1 / (1 + exp(-x)).

    Computed as exp(-logaddexp(0, -x)), which is algebraically identical but
    never overflows: the naive form raises RuntimeWarnings and overflows
    np.exp for large-magnitude negative inputs. Accepts scalars or arrays
    and returns a result of matching shape.
    """
    return np.exp(-np.logaddexp(0, -x))

print("Evaluation helper functions loaded.")

Demo 1: Benchmarks as noisy estimators

This cell studies Benchmarks as noisy estimators through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 5

header("Demo 1 - Benchmarks as noisy estimators: finite-sample accuracy interval")
n = 600
y = (np.random.rand(n) < 0.74).astype(float)
mean = y.mean()
se = np.sqrt(mean * (1 - mean) / n)
lo, hi = mean - 1.96 * se, mean + 1.96 * se
print(f"accuracy={mean:.3f}, 95% CI=[{lo:.3f}, {hi:.3f}], n={n}")
check_true(lo <= mean <= hi, "point estimate lies inside its interval")

Demo 2: Capability versus observed score

This cell studies Capability versus observed score through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 7

header("Demo 2 - Capability versus observed score: bootstrap uncertainty")
values = np.clip(np.random.normal(0.68, 0.18, size=500), 0, 1)
mean, lo, hi = bootstrap_mean_ci(values, B=800)
print(f"mean={mean:.3f}, bootstrap 95% CI=[{lo:.3f}, {hi:.3f}]")
check_true(hi > lo, "bootstrap interval has positive width")

Demo 3: Metric pluralism for LLM systems

This cell studies Metric pluralism for LLM systems through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 9

header("Demo 3 - Metric pluralism for LLM systems: metric comparison plot")
names = np.array(["base", "prompt", "retrieval", "tool"])
scores = np.array([0.62, 0.68, 0.73, 0.70])
fig, ax = plt.subplots()
bars = ax.bar(names, scores, color=[COLORS["primary"], COLORS["secondary"], COLORS["tertiary"], COLORS["highlight"]])
ax.set_title("Evaluation metric across system variants")
ax.set_xlabel("System variant")
ax.set_ylabel("Score")
ax.set_ylim(0, 1)
for bar, val in zip(bars, scores):
    ax.text(bar.get_x() + bar.get_width()/2, val + 0.02, f"{val:.2f}", ha="center")
fig.tight_layout()
plt.show()
print("plotted metric comparison for four variants")
check_true(scores.max() > scores.min(), "variants differ in measured score")

Demo 4: Benchmark lifecycle and saturation

This cell studies Benchmark lifecycle and saturation through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 11

header("Demo 4 - Benchmark lifecycle and saturation: pass-at-k estimator")
n = 200
c = 43
def pass_at_k(n, c, k):
    if n - c < k:
        return 1.0
    return 1.0 - np.prod(1 - k / np.arange(n - c + 1, n + 1))
for k in [1, 5, 10, 50]:
    print(f"pass@{k}={pass_at_k(n, c, k):.3f}")
check_true(pass_at_k(n, c, 10) >= pass_at_k(n, c, 1), "pass@k increases with k")

Demo 5: What benchmark scores can and cannot certify

This cell studies What benchmark scores can and cannot certify through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 13

header("Demo 5 - What benchmark scores can and cannot certify: slice metrics")
slices = np.array(["short", "long", "code", "math", "multilingual"])
acc = np.array([0.82, 0.71, 0.66, 0.58, 0.62])
n = np.array([400, 260, 180, 160, 140])
se = np.sqrt(acc * (1 - acc) / n)
for name, a, e in zip(slices, acc, se):
    print(f"slice={name:12s} accuracy={a:.3f} +/- {1.96*e:.3f}")
check_true(acc.min() < acc.max(), "slices reveal heterogeneous performance")

Demo 6: Model and system under test

This cell studies Model and system under test through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 15

header("Demo 6 - Model and system under test: factorial interaction")
A = np.array([0, 0, 1, 1])
B = np.array([0, 1, 0, 1])
y = 0.60 + 0.05 * A + 0.03 * B + 0.08 * A * B
interaction = y[3] - y[2] - y[1] + y[0]
for a, b, val in zip(A, B, y):
    print(f"A={a}, B={b}, score={val:.3f}")
print(f"interaction={interaction:.3f}")
check_close(float(interaction), 0.08, tol=1e-8, message="interaction recovered")

Demo 7: Task, item, and evaluation sample

This cell studies Task, item, and evaluation sample through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 17

header("Demo 7 - Task, item, and evaluation sample: ablation effects")
base = 0.64
effects = {"retrieval": 0.09, "tool": 0.04, "rerank": 0.03, "cot": 0.02}
for name, eff in effects.items():
    print(f"remove {name:9s} -> expected score {base + sum(effects.values()) - eff:.3f}, effect={eff:.3f}")
total = base + sum(effects.values())
print(f"full system score={total:.3f}")
check_true(total > base, "full system improves over base in synthetic ablation")

Demo 8: Prompt protocol and decoding policy

This cell studies Prompt protocol and decoding policy through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 19

header("Demo 8 - Prompt protocol and decoding policy: reliability diagram and ECE")
n = 1200
confidence = np.random.beta(4, 2, size=n)
true_prob = np.clip(confidence - 0.12 + np.random.normal(0, 0.03, size=n), 0.01, 0.99)
correct = (np.random.rand(n) < true_prob).astype(float)
bins = np.linspace(0, 1, 11)
ece = 0.0
centers, accs, confs = [], [], []
for lo, hi in zip(bins[:-1], bins[1:]):
    mask = (confidence >= lo) & (confidence < hi)
    if mask.any():
        acc = correct[mask].mean()
        conf = confidence[mask].mean()
        weight = mask.mean()
        ece += weight * abs(acc - conf)
        centers.append((lo + hi) / 2)
        accs.append(acc)
        confs.append(conf)
fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], color=COLORS["neutral"], linestyle="--", label="perfect calibration")
ax.plot(confs, accs, marker="o", color=COLORS["primary"], label="model")
ax.set_title("Reliability diagram")
ax.set_xlabel("Mean confidence")
ax.set_ylabel("Empirical accuracy")
ax.legend()
fig.tight_layout()
plt.show()
print(f"ECE={ece:.4f}")
check_true(ece >= 0, "ECE is nonnegative")

Demo 9: Scorer, metric, and aggregate estimate

This cell studies Scorer, metric, and aggregate estimate through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 21

header("Demo 9 - Scorer, metric, and aggregate estimate: finite-sample accuracy interval")
n = 600
y = (np.random.rand(n) < 0.74).astype(float)
mean = y.mean()
se = np.sqrt(mean * (1 - mean) / n)
lo, hi = mean - 1.96 * se, mean + 1.96 * se
print(f"accuracy={mean:.3f}, 95% CI=[{lo:.3f}, {hi:.3f}], n={n}")
check_true(lo <= mean <= hi, "point estimate lies inside its interval")

Demo 10: Confidence interval and leaderboard rank

This cell studies Confidence interval and leaderboard rank through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 23

header("Demo 10 - Confidence interval and leaderboard rank: bootstrap uncertainty")
values = np.clip(np.random.normal(0.68, 0.18, size=500), 0, 1)
mean, lo, hi = bootstrap_mean_ci(values, B=800)
print(f"mean={mean:.3f}, bootstrap 95% CI=[{lo:.3f}, {hi:.3f}]")
check_true(hi > lo, "bootstrap interval has positive width")

Demo 11: Task taxonomy and coverage

This cell studies Task taxonomy and coverage through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 25

header("Demo 11 - Task taxonomy and coverage: metric comparison plot")
names = np.array(["base", "prompt", "retrieval", "tool"])
scores = np.array([0.62, 0.68, 0.73, 0.70])
fig, ax = plt.subplots()
bars = ax.bar(names, scores, color=[COLORS["primary"], COLORS["secondary"], COLORS["tertiary"], COLORS["highlight"]])
ax.set_title("Evaluation metric across system variants")
ax.set_xlabel("System variant")
ax.set_ylabel("Score")
ax.set_ylim(0, 1)
for bar, val in zip(bars, scores):
    ax.text(bar.get_x() + bar.get_width()/2, val + 0.02, f"{val:.2f}", ha="center")
fig.tight_layout()
plt.show()
print("plotted metric comparison for four variants")
check_true(scores.max() > scores.min(), "variants differ in measured score")

Demo 12: Dataset sampling and item independence

This cell studies Dataset sampling and item independence through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 27

header("Demo 12 - Dataset sampling and item independence: pass-at-k estimator")
n = 200
c = 43
def pass_at_k(n, c, k):
    if n - c < k:
        return 1.0
    return 1.0 - np.prod(1 - k / np.arange(n - c + 1, n + 1))
for k in [1, 5, 10, 50]:
    print(f"pass@{k}={pass_at_k(n, c, k):.3f}")
check_true(pass_at_k(n, c, 10) >= pass_at_k(n, c, 1), "pass@k increases with k")

Demo 13: Prompt templates and few-shot policy

This cell studies Prompt templates and few-shot policy through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 29

header("Demo 13 - Prompt templates and few-shot policy: slice metrics")
slices = np.array(["short", "long", "code", "math", "multilingual"])
acc = np.array([0.82, 0.71, 0.66, 0.58, 0.62])
n = np.array([400, 260, 180, 160, 140])
se = np.sqrt(acc * (1 - acc) / n)
for name, a, e in zip(slices, acc, se):
    print(f"slice={name:12s} accuracy={a:.3f} +/- {1.96*e:.3f}")
check_true(acc.min() < acc.max(), "slices reveal heterogeneous performance")

Demo 14: Grading functions and rubrics

This cell studies Grading functions and rubrics through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 31

header("Demo 14 - Grading functions and rubrics: factorial interaction")
A = np.array([0, 0, 1, 1])
B = np.array([0, 1, 0, 1])
y = 0.60 + 0.05 * A + 0.03 * B + 0.08 * A * B
interaction = y[3] - y[2] - y[1] + y[0]
for a, b, val in zip(A, B, y):
    print(f"A={a}, B={b}, score={val:.3f}")
print(f"interaction={interaction:.3f}")
check_close(float(interaction), 0.08, tol=1e-8, message="interaction recovered")

Demo 15: Contamination flags and eval provenance

This cell studies Contamination flags and eval provenance through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 33

header("Demo 15 - Contamination flags and eval provenance: ablation effects")
base = 0.64
effects = {"retrieval": 0.09, "tool": 0.04, "rerank": 0.03, "cot": 0.02}
for name, eff in effects.items():
    print(f"remove {name:9s} -> expected score {base + sum(effects.values()) - eff:.3f}, effect={eff:.3f}")
total = base + sum(effects.values())
print(f"full system score={total:.3f}")
check_true(total > base, "full system improves over base in synthetic ablation")

Demo 16: Accuracy and exact match

This cell studies Accuracy and exact match through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 35

header("Demo 16 - Accuracy and exact match: reliability diagram and ECE")
n = 1200
confidence = np.random.beta(4, 2, size=n)
true_prob = np.clip(confidence - 0.12 + np.random.normal(0, 0.03, size=n), 0.01, 0.99)
correct = (np.random.rand(n) < true_prob).astype(float)
bins = np.linspace(0, 1, 11)
ece = 0.0
centers, accs, confs = [], [], []
for lo, hi in zip(bins[:-1], bins[1:]):
    mask = (confidence >= lo) & (confidence < hi)
    if mask.any():
        acc = correct[mask].mean()
        conf = confidence[mask].mean()
        weight = mask.mean()
        ece += weight * abs(acc - conf)
        centers.append((lo + hi) / 2)
        accs.append(acc)
        confs.append(conf)
fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], color=COLORS["neutral"], linestyle="--", label="perfect calibration")
ax.plot(confs, accs, marker="o", color=COLORS["primary"], label="model")
ax.set_title("Reliability diagram")
ax.set_xlabel("Mean confidence")
ax.set_ylabel("Empirical accuracy")
ax.legend()
fig.tight_layout()
plt.show()
print(f"ECE={ece:.4f}")
check_true(ece >= 0, "ECE is nonnegative")

Demo 17: Precision, recall, and F1

This cell studies Precision, recall, and F1 through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 37

header("Demo 17 - Precision, recall, and F1: finite-sample accuracy interval")
n = 600
y = (np.random.rand(n) < 0.74).astype(float)
mean = y.mean()
se = np.sqrt(mean * (1 - mean) / n)
lo, hi = mean - 1.96 * se, mean + 1.96 * se
print(f"accuracy={mean:.3f}, 95% CI=[{lo:.3f}, {hi:.3f}], n={n}")
check_true(lo <= mean <= hi, "point estimate lies inside its interval")

Demo 18: Pass at k for code generation

This cell studies Pass at k for code generation through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 39

header("Demo 18 - Pass at k for code generation: bootstrap uncertainty")
values = np.clip(np.random.normal(0.68, 0.18, size=500), 0, 1)
mean, lo, hi = bootstrap_mean_ci(values, B=800)
print(f"mean={mean:.3f}, bootstrap 95% CI=[{lo:.3f}, {hi:.3f}]")
check_true(hi > lo, "bootstrap interval has positive width")

Demo 19: Log-probability and perplexity

This cell studies Log-probability and perplexity through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 41

header("Demo 19 - Log-probability and perplexity: metric comparison plot")
names = np.array(["base", "prompt", "retrieval", "tool"])
scores = np.array([0.62, 0.68, 0.73, 0.70])
fig, ax = plt.subplots()
bars = ax.bar(names, scores, color=[COLORS["primary"], COLORS["secondary"], COLORS["tertiary"], COLORS["highlight"]])
ax.set_title("Evaluation metric across system variants")
ax.set_xlabel("System variant")
ax.set_ylabel("Score")
ax.set_ylim(0, 1)
for bar, val in zip(bars, scores):
    ax.text(bar.get_x() + bar.get_width()/2, val + 0.02, f"{val:.2f}", ha="center")
fig.tight_layout()
plt.show()
print("plotted metric comparison for four variants")
check_true(scores.max() > scores.min(), "variants differ in measured score")

Demo 20: Pairwise preference and judge agreement

This cell studies Pairwise preference and judge agreement through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 43

header("Demo 20 - Pairwise preference and judge agreement: pass-at-k estimator")
n = 200
c = 43
def pass_at_k(n, c, k):
    if n - c < k:
        return 1.0
    return 1.0 - np.prod(1 - k / np.arange(n - c + 1, n + 1))
for k in [1, 5, 10, 50]:
    print(f"pass@{k}={pass_at_k(n, c, k):.3f}")
check_true(pass_at_k(n, c, 10) >= pass_at_k(n, c, 1), "pass@k increases with k")

Demo 21: Bootstrap intervals

This cell studies Bootstrap intervals through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 45

header("Demo 21 - Bootstrap intervals: slice metrics")
slices = np.array(["short", "long", "code", "math", "multilingual"])
acc = np.array([0.82, 0.71, 0.66, 0.58, 0.62])
n = np.array([400, 260, 180, 160, 140])
se = np.sqrt(acc * (1 - acc) / n)
for name, a, e in zip(slices, acc, se):
    print(f"slice={name:12s} accuracy={a:.3f} +/- {1.96*e:.3f}")
check_true(acc.min() < acc.max(), "slices reveal heterogeneous performance")

Demo 22: Paired model comparisons

This cell studies Paired model comparisons through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 47

header("Demo 22 - Paired model comparisons: factorial interaction")
A = np.array([0, 0, 1, 1])
B = np.array([0, 1, 0, 1])
y = 0.60 + 0.05 * A + 0.03 * B + 0.08 * A * B
interaction = y[3] - y[2] - y[1] + y[0]
for a, b, val in zip(A, B, y):
    print(f"A={a}, B={b}, score={val:.3f}")
print(f"interaction={interaction:.3f}")
check_close(float(interaction), 0.08, tol=1e-8, message="interaction recovered")

Demo 23: Leaderboard uncertainty

This cell studies Leaderboard uncertainty through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 49

header("Demo 23 - Leaderboard uncertainty: ablation effects")
base = 0.64
effects = {"retrieval": 0.09, "tool": 0.04, "rerank": 0.03, "cot": 0.02}
for name, eff in effects.items():
    print(f"remove {name:9s} -> expected score {base + sum(effects.values()) - eff:.3f}, effect={eff:.3f}")
total = base + sum(effects.values())
print(f"full system score={total:.3f}")
check_true(total > base, "full system improves over base in synthetic ablation")

Demo 24: Multiple comparisons

This cell studies Multiple comparisons through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 51

header("Demo 24 - Multiple comparisons: reliability diagram and ECE")
n = 1200
confidence = np.random.beta(4, 2, size=n)
true_prob = np.clip(confidence - 0.12 + np.random.normal(0, 0.03, size=n), 0.01, 0.99)
correct = (np.random.rand(n) < true_prob).astype(float)
bins = np.linspace(0, 1, 11)
ece = 0.0
centers, accs, confs = [], [], []
for lo, hi in zip(bins[:-1], bins[1:]):
    mask = (confidence >= lo) & (confidence < hi)
    if mask.any():
        acc = correct[mask].mean()
        conf = confidence[mask].mean()
        weight = mask.mean()
        ece += weight * abs(acc - conf)
        centers.append((lo + hi) / 2)
        accs.append(acc)
        confs.append(conf)
fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], color=COLORS["neutral"], linestyle="--", label="perfect calibration")
ax.plot(confs, accs, marker="o", color=COLORS["primary"], label="model")
ax.set_title("Reliability diagram")
ax.set_xlabel("Mean confidence")
ax.set_ylabel("Empirical accuracy")
ax.legend()
fig.tight_layout()
plt.show()
print(f"ECE={ece:.4f}")
check_true(ece >= 0, "ECE is nonnegative")

Demo 25: Benchmark power and sample size

This cell studies Benchmark power and sample size through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 53

header("Demo 25 - Benchmark power and sample size: finite-sample accuracy interval")
n = 600
y = (np.random.rand(n) < 0.74).astype(float)
mean = y.mean()
se = np.sqrt(mean * (1 - mean) / n)
lo, hi = mean - 1.96 * se, mean + 1.96 * se
print(f"accuracy={mean:.3f}, 95% CI=[{lo:.3f}, {hi:.3f}], n={n}")
check_true(lo <= mean <= hi, "point estimate lies inside its interval")