Theory Notebook
Converted from theory.ipynb for web reading.
Capability Benchmarks
Capability benchmarks estimate what a model can do under a stated protocol; reliability begins when the protocol, metric, and uncertainty are explicit.
This notebook is the executable companion to notes.md. It uses synthetic data so the evaluation mathematics can run anywhere without external files.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import math
COLORS = {
    "primary": "#0077BB",
    "secondary": "#EE7733",
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",
    "highlight": "#EE3377",
}

def header(title):
    print("\n" + "=" * 80)
    print(title)
    print("=" * 80)

def check_true(condition, message):
    print(f"{'PASS' if bool(condition) else 'FAIL'} - {message}")
    assert bool(condition)

def check_close(actual, expected, tol=1e-8, message="values close"):
    ok = abs(actual - expected) <= tol
    print(f"{'PASS' if ok else 'FAIL'} - {message}: actual={actual:.6f}, expected={expected:.6f}")
    assert ok

def bootstrap_mean_ci(values, B=1000, alpha=0.05):
    # Percentile bootstrap: resample items with replacement B times and
    # take the empirical (alpha/2, 1 - alpha/2) quantiles of the mean.
    values = np.asarray(values, dtype=float)
    idx = np.random.randint(0, len(values), size=(B, len(values)))
    boot = values[idx].mean(axis=1)
    lo, hi = np.quantile(boot, [alpha / 2, 1 - alpha / 2])
    return float(values.mean()), float(lo), float(hi)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))
print("Evaluation helper functions loaded.")
Demo 1: Benchmarks as noisy estimators
A benchmark score is a sample statistic, not the capability itself. This cell treats accuracy on n = 600 synthetic items as a binomial mean and attaches a normal-approximation 95% confidence interval.
Code cell 5
header("Demo 1 - Benchmarks as noisy estimators: finite-sample accuracy interval")
n = 600
y = (np.random.rand(n) < 0.74).astype(float)
mean = y.mean()
se = np.sqrt(mean * (1 - mean) / n)
lo, hi = mean - 1.96 * se, mean + 1.96 * se
print(f"accuracy={mean:.3f}, 95% CI=[{lo:.3f}, {hi:.3f}], n={n}")
check_true(lo <= mean <= hi, "point estimate lies inside its interval")
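The Wald interval above is simple but degrades when accuracy sits near 0 or 1. A minimal sketch of the Wilson score interval, which behaves better at the boundaries; the helper name wilson_ci is ours, not one of the notebook's helpers.

def wilson_ci(k, n, z=1.96):
    # Wilson score interval for a binomial proportion.
    p = k / n
    denom = 1 + z**2 / n
    center = (p + z**2 / (2 * n)) / denom
    half = (z / denom) * np.sqrt(p * (1 - p) / n + z**2 / (4 * n**2))
    return center - half, center + half

lo_w, hi_w = wilson_ci(int(y.sum()), n)
print(f"Wilson 95% CI=[{lo_w:.3f}, {hi_w:.3f}]")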
Demo 2: Capability versus observed score
Observed score and underlying capability differ by sampling noise. This cell quantifies that noise with a nonparametric bootstrap over 500 synthetic per-item scores.
Code cell 7
header("Demo 2 - Capability versus observed score: bootstrap uncertainty")
values = np.clip(np.random.normal(0.68, 0.18, size=500), 0, 1)
mean, lo, hi = bootstrap_mean_ci(values, B=800)
print(f"mean={mean:.3f}, bootstrap 95% CI=[{lo:.3f}, {hi:.3f}]")
check_true(hi > lo, "bootstrap interval has positive width")
Demo 3: Metric pluralism for LLM systems
No single metric settles which system variant is best. This cell plots one metric across four variants as the starting point for a multi-metric comparison.
Code cell 9
header("Demo 3 - Metric pluralism for LLM systems: metric comparison plot")
names = np.array(["base", "prompt", "retrieval", "tool"])
scores = np.array([0.62, 0.68, 0.73, 0.70])
fig, ax = plt.subplots()
bars = ax.bar(names, scores, color=[COLORS["primary"], COLORS["secondary"], COLORS["tertiary"], COLORS["highlight"]])
ax.set_title("Evaluation metric across system variants")
ax.set_xlabel("System variant")
ax.set_ylabel("Score")
ax.set_ylim(0, 1)
for bar, val in zip(bars, scores):
    ax.text(bar.get_x() + bar.get_width()/2, val + 0.02, f"{val:.2f}", ha="center")
fig.tight_layout()
plt.show()
print("plotted metric comparison for four variants")
check_true(scores.max() > scores.min(), "variants differ in measured score")
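The bars above carry no uncertainty statement. A minimal sketch that adds 95% binomial error bars, assuming each variant was scored on roughly 500 items; the per-variant item count is invented for illustration.

n_items = 500  # hypothetical items per variant
err = 1.96 * np.sqrt(scores * (1 - scores) / n_items)
fig, ax = plt.subplots()
ax.bar(names, scores, yerr=err, capsize=4, color=COLORS["primary"])
ax.set_ylim(0, 1)
ax.set_title("Scores with 95% binomial error bars")
plt.show()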
Demo 4: Benchmark lifecycle and saturation
As a benchmark saturates, pass@k at larger k still separates models that pass@1 no longer can. This cell implements the unbiased pass@k estimator from n samples with c correct.
Code cell 11
header("Demo 4 - Benchmark lifecycle and saturation: pass-at-k estimator")
n = 200
c = 43
def pass_at_k(n, c, k):
    # Unbiased estimate of P(at least one of k sampled completions is
    # correct), given c correct among n samples.
    if n - c < k:
        return 1.0
    return 1.0 - np.prod(1 - k / np.arange(n - c + 1, n + 1))

for k in [1, 5, 10, 50]:
    print(f"pass@{k}={pass_at_k(n, c, k):.3f}")
check_true(pass_at_k(n, c, 10) >= pass_at_k(n, c, 1), "pass@k increases with k")
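The product form above is the numerically stable version of the closed form pass@k = 1 - C(n-c, k) / C(n, k). A quick cross-check with math.comb; the helper name pass_at_k_comb is ours.

from math import comb

def pass_at_k_comb(n, c, k):
    # Probability that k draws without replacement contain at least
    # one of the c correct completions.
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)

for k in [1, 5, 10, 50]:
    check_close(pass_at_k(n, c, k), pass_at_k_comb(n, c, k),
                tol=1e-9, message=f"product and comb forms agree at k={k}")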
Demo 5: What benchmark scores can and cannot certify
An aggregate score can certify average performance while hiding weak slices. This cell reports per-slice accuracy with binomial error bars to expose that heterogeneity.
Code cell 13
header("Demo 5 - What benchmark scores can and cannot certify: slice metrics")
slices = np.array(["short", "long", "code", "math", "multilingual"])
acc = np.array([0.82, 0.71, 0.66, 0.58, 0.62])
n = np.array([400, 260, 180, 160, 140])
se = np.sqrt(acc * (1 - acc) / n)
for name, a, e in zip(slices, acc, se):
    print(f"slice={name:12s} accuracy={a:.3f} +/- {1.96*e:.3f}")
check_true(acc.min() < acc.max(), "slices reveal heterogeneous performance")
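Whether two slices genuinely differ is a two-proportion comparison. A minimal sketch of the pooled z-test between the strongest and weakest slices, using the synthetic counts above.

i, j = acc.argmax(), acc.argmin()
k1, k2 = acc[i] * n[i], acc[j] * n[j]  # implied correct counts
p_pool = (k1 + k2) / (n[i] + n[j])
z = (acc[i] - acc[j]) / np.sqrt(p_pool * (1 - p_pool) * (1 / n[i] + 1 / n[j]))
print(f"z={z:.2f} for {slices[i]} vs {slices[j]} (|z| > 1.96 suggests a real gap)")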
Demo 6: Model and system under test
The thing under test is a system, not just a model, and its components can interact. This cell recovers an interaction effect from a synthetic 2x2 factorial design.
Code cell 15
header("Demo 6 - Model and system under test: factorial interaction")
A = np.array([0, 0, 1, 1])
B = np.array([0, 1, 0, 1])
y = 0.60 + 0.05 * A + 0.03 * B + 0.08 * A * B
interaction = y[3] - y[2] - y[1] + y[0]
for a, b, val in zip(A, B, y):
    print(f"A={a}, B={b}, score={val:.3f}")
print(f"interaction={interaction:.3f}")
check_close(float(interaction), 0.08, tol=1e-8, message="interaction recovered")
Demo 7: Task, item, and evaluation sample
Scores attach to a finite evaluation sample of items, and each system component contributes to them. This cell walks through a synthetic additive ablation over four components.
Code cell 17
header("Demo 7 - Task, item, and evaluation sample: ablation effects")
base = 0.64
effects = {"retrieval": 0.09, "tool": 0.04, "rerank": 0.03, "cot": 0.02}
for name, eff in effects.items():
    print(f"remove {name:9s} -> expected score {base + sum(effects.values()) - eff:.3f}, effect={eff:.3f}")
total = base + sum(effects.values())
print(f"full system score={total:.3f}")
check_true(total > base, "full system improves over base in synthetic ablation")
Demo 8: Prompt protocol and decoding policy
Prompt protocol and decoding policy shape calibration as well as accuracy. This cell draws a reliability diagram and computes expected calibration error (ECE) for a synthetically overconfident model.
Code cell 19
header("Demo 8 - Prompt protocol and decoding policy: reliability diagram and ECE")
n = 1200
confidence = np.random.beta(4, 2, size=n)
true_prob = np.clip(confidence - 0.12 + np.random.normal(0, 0.03, size=n), 0.01, 0.99)
correct = (np.random.rand(n) < true_prob).astype(float)
bins = np.linspace(0, 1, 11)
ece = 0.0
centers, accs, confs = [], [], []
for lo, hi in zip(bins[:-1], bins[1:]):
    mask = (confidence >= lo) & (confidence < hi)
    if mask.any():
        acc = correct[mask].mean()
        conf = confidence[mask].mean()
        weight = mask.mean()  # fraction of all predictions landing in this bin
        ece += weight * abs(acc - conf)
        centers.append((lo + hi) / 2)
        accs.append(acc)
        confs.append(conf)
fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], color=COLORS["neutral"], linestyle="--", label="perfect calibration")
ax.plot(confs, accs, marker="o", color=COLORS["primary"], label="model")
ax.set_title("Reliability diagram")
ax.set_xlabel("Mean confidence")
ax.set_ylabel("Empirical accuracy")
ax.legend()
fig.tight_layout()
plt.show()
print(f"ECE={ece:.4f}")
check_true(ece >= 0, "ECE is nonnegative")
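ECE depends on the binning choice, which the single number hides. A minimal sketch that recomputes it for several bin counts over the same confidence and correct arrays; the helper name ece_equal_width is ours.

def ece_equal_width(conf, corr, n_bins):
    edges = np.linspace(0, 1, n_bins + 1)
    total = 0.0
    for lo_e, hi_e in zip(edges[:-1], edges[1:]):
        m = (conf >= lo_e) & (conf < hi_e)
        if m.any():
            total += m.mean() * abs(corr[m].mean() - conf[m].mean())
    return total

for nb in [5, 10, 20, 50]:
    print(f"bins={nb:2d}  ECE={ece_equal_width(confidence, correct, nb):.4f}")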
Demo 9: Scorer, metric, and aggregate estimate
A scorer maps outputs to item-level scores; a metric aggregates them into an estimate. This cell repeats the binomial accuracy interval to keep that aggregation step explicit.
Code cell 21
header("Demo 9 - Scorer, metric, and aggregate estimate: finite-sample accuracy interval")
n = 600
y = (np.random.rand(n) < 0.74).astype(float)
mean = y.mean()
se = np.sqrt(mean * (1 - mean) / n)
lo, hi = mean - 1.96 * se, mean + 1.96 * se
print(f"accuracy={mean:.3f}, 95% CI=[{lo:.3f}, {hi:.3f}], n={n}")
check_true(lo <= mean <= hi, "point estimate lies inside its interval")
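An exact alternative to the normal approximation is the Clopper-Pearson interval, built from beta quantiles. A sketch assuming scipy is available; it is not imported anywhere else in this notebook.

from scipy.stats import beta

k = int(y.sum())
lo_cp = beta.ppf(0.025, k, n - k + 1) if k > 0 else 0.0
hi_cp = beta.ppf(0.975, k + 1, n - k) if k < n else 1.0
print(f"Clopper-Pearson 95% CI=[{lo_cp:.3f}, {hi_cp:.3f}]")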
Demo 10: Confidence interval and leaderboard rank
Leaderboard ranks inherit the uncertainty of the scores behind them. This cell bootstraps a mean score; the sketch that follows extends the same idea to rank uncertainty.
Code cell 23
header("Demo 10 - Confidence interval and leaderboard rank: bootstrap uncertainty")
values = np.clip(np.random.normal(0.68, 0.18, size=500), 0, 1)
mean, lo, hi = bootstrap_mean_ci(values, B=800)
print(f"mean={mean:.3f}, bootstrap 95% CI=[{lo:.3f}, {hi:.3f}]")
check_true(hi > lo, "bootstrap interval has positive width")
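To put uncertainty on ranks rather than scores, resample the items jointly for several systems and tabulate how often each lands in first place. A minimal sketch with three synthetic systems; every number here is invented for illustration.

rng = np.random.default_rng(0)
systems = {name: np.clip(rng.normal(m, 0.18, 400), 0, 1)
           for name, m in [("A", 0.70), ("B", 0.68), ("C", 0.66)]}
B = 2000
wins = {name: 0 for name in systems}
for _ in range(B):
    idx = rng.integers(0, 400, 400)  # one shared resample of items
    means = {name: v[idx].mean() for name, v in systems.items()}
    wins[max(means, key=means.get)] += 1
for name, w in wins.items():
    print(f"P(rank 1) for {name} ~ {w / B:.2f}")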
Demo 11: Task taxonomy and coverage
Coverage across a task taxonomy matters as much as the headline number. This cell compares one metric across four system variants as a coverage-style summary.
Code cell 25
header("Demo 11 - Task taxonomy and coverage: metric comparison plot")
names = np.array(["base", "prompt", "retrieval", "tool"])
scores = np.array([0.62, 0.68, 0.73, 0.70])
fig, ax = plt.subplots()
bars = ax.bar(names, scores, color=[COLORS["primary"], COLORS["secondary"], COLORS["tertiary"], COLORS["highlight"]])
ax.set_title("Evaluation metric across system variants")
ax.set_xlabel("System variant")
ax.set_ylabel("Score")
ax.set_ylim(0, 1)
for bar, val in zip(bars, scores):
    ax.text(bar.get_x() + bar.get_width()/2, val + 0.02, f"{val:.2f}", ha="center")
fig.tight_layout()
plt.show()
print("plotted metric comparison for four variants")
check_true(scores.max() > scores.min(), "variants differ in measured score")
Demo 12: Dataset sampling and item independence
Standard intervals assume items are drawn independently. This cell revisits the pass@k estimator, whose derivation likewise rests on exchangeable samples; the sketch that follows verifies it by direct simulation.
Code cell 27
header("Demo 12 - Dataset sampling and item independence: pass-at-k estimator")
n = 200
c = 43
def pass_at_k(n, c, k):
    # Unbiased estimate of P(at least one of k sampled completions is
    # correct), given c correct among n samples.
    if n - c < k:
        return 1.0
    return 1.0 - np.prod(1 - k / np.arange(n - c + 1, n + 1))

for k in [1, 5, 10, 50]:
    print(f"pass@{k}={pass_at_k(n, c, k):.3f}")
check_true(pass_at_k(n, c, 10) >= pass_at_k(n, c, 1), "pass@k increases with k")
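Because the estimator assumes exchangeable samples, a direct Monte Carlo check is cheap: draw k of the n completions without replacement and count how often at least one is correct. A minimal sketch:

rng = np.random.default_rng(1)
outcomes = np.array([1] * c + [0] * (n - c))
k = 10
trials = 20000
hits = sum(rng.choice(outcomes, size=k, replace=False).any()
           for _ in range(trials))
print(f"Monte Carlo pass@{k} ~ {hits / trials:.3f} vs estimator {pass_at_k(n, c, k):.3f}")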
Demo 13: Prompt templates and few-shot policy
Prompt templates and few-shot policy can shift some slices more than others. This cell reports per-slice accuracy with binomial error bars.
Code cell 29
header("Demo 13 - Prompt templates and few-shot policy: slice metrics")
slices = np.array(["short", "long", "code", "math", "multilingual"])
acc = np.array([0.82, 0.71, 0.66, 0.58, 0.62])
n = np.array([400, 260, 180, 160, 140])
se = np.sqrt(acc * (1 - acc) / n)
for name, a, e in zip(slices, acc, se):
    print(f"slice={name:12s} accuracy={a:.3f} +/- {1.96*e:.3f}")
check_true(acc.min() < acc.max(), "slices reveal heterogeneous performance")
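When slices differ in size, the headline number depends on how they are averaged. A minimal sketch contrasting the item-weighted (micro) and slice-weighted (macro) means of the accuracies above:

micro = np.average(acc, weights=n)  # weight each slice by its item count
macro = acc.mean()                  # weight each slice equally
print(f"micro average={micro:.3f}, macro average={macro:.3f}")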
Demo 14: Grading functions and rubrics
A grading function turns free-form output into a score, and grader choices can interact with system components. This cell recovers an interaction effect from a synthetic 2x2 design.
Code cell 31
header("Demo 14 - Grading functions and rubrics: factorial interaction")
A = np.array([0, 0, 1, 1])
B = np.array([0, 1, 0, 1])
y = 0.60 + 0.05 * A + 0.03 * B + 0.08 * A * B
interaction = y[3] - y[2] - y[1] + y[0]
for a, b, val in zip(A, B, y):
    print(f"A={a}, B={b}, score={val:.3f}")
print(f"interaction={interaction:.3f}")
check_close(float(interaction), 0.08, tol=1e-8, message="interaction recovered")
Demo 15: Contamination flags and eval provenance
Contamination and provenance questions are often probed by ablation: remove a component and check whether the score moves as documented. This cell runs a synthetic additive ablation.
Code cell 33
header("Demo 15 - Contamination flags and eval provenance: ablation effects")
base = 0.64
effects = {"retrieval": 0.09, "tool": 0.04, "rerank": 0.03, "cot": 0.02}
for name, eff in effects.items():
    print(f"remove {name:9s} -> expected score {base + sum(effects.values()) - eff:.3f}, effect={eff:.3f}")
total = base + sum(effects.values())
print(f"full system score={total:.3f}")
check_true(total > base, "full system improves over base in synthetic ablation")
Demo 16: Accuracy and exact match
Exact-match accuracy says nothing about whether stated confidence is trustworthy. This cell pairs a reliability diagram with ECE on synthetic predictions; the sketch that follows adds the Brier score.
Code cell 35
header("Demo 16 - Accuracy and exact match: reliability diagram and ECE")
n = 1200
confidence = np.random.beta(4, 2, size=n)
true_prob = np.clip(confidence - 0.12 + np.random.normal(0, 0.03, size=n), 0.01, 0.99)
correct = (np.random.rand(n) < true_prob).astype(float)
bins = np.linspace(0, 1, 11)
ece = 0.0
centers, accs, confs = [], [], []
for lo, hi in zip(bins[:-1], bins[1:]):
    mask = (confidence >= lo) & (confidence < hi)
    if mask.any():
        acc = correct[mask].mean()
        conf = confidence[mask].mean()
        weight = mask.mean()  # fraction of all predictions landing in this bin
        ece += weight * abs(acc - conf)
        centers.append((lo + hi) / 2)
        accs.append(acc)
        confs.append(conf)
fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], color=COLORS["neutral"], linestyle="--", label="perfect calibration")
ax.plot(confs, accs, marker="o", color=COLORS["primary"], label="model")
ax.set_title("Reliability diagram")
ax.set_xlabel("Mean confidence")
ax.set_ylabel("Empirical accuracy")
ax.legend()
fig.tight_layout()
plt.show()
print(f"ECE={ece:.4f}")
check_true(ece >= 0, "ECE is nonnegative")
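ECE is one lens on calibration; the Brier score is another, a proper scoring rule that mixes calibration with sharpness. A one-line check on the same synthetic predictions:

brier = np.mean((confidence - correct) ** 2)
print(f"Brier score={brier:.4f} (0 is perfect; 0.25 matches a constant 0.5 prediction)")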
Demo 17: Precision, recall, and F1
Accuracy is only one confusion-matrix summary. This cell computes its finite-sample interval; the sketch that follows adds precision, recall, and F1.
Code cell 37
header("Demo 17 - Precision, recall, and F1: finite-sample accuracy interval")
n = 600
y = (np.random.rand(n) < 0.74).astype(float)
mean = y.mean()
se = np.sqrt(mean * (1 - mean) / n)
lo, hi = mean - 1.96 * se, mean + 1.96 * se
print(f"accuracy={mean:.3f}, 95% CI=[{lo:.3f}, {hi:.3f}], n={n}")
check_true(lo <= mean <= hi, "point estimate lies inside its interval")
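The cell above scores only accuracy, while the demo title also names precision, recall, and F1. A minimal sketch on synthetic binary labels and predictions; both arrays are invented for illustration.

rng = np.random.default_rng(2)
labels = (rng.random(600) < 0.4).astype(int)
preds = np.where(rng.random(600) < 0.8, labels, 1 - labels)  # ~80% agreement
tp = np.sum((preds == 1) & (labels == 1))
fp = np.sum((preds == 1) & (labels == 0))
fn = np.sum((preds == 0) & (labels == 1))
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * precision * recall / (precision + recall)
print(f"precision={precision:.3f}, recall={recall:.3f}, F1={f1:.3f}")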
Demo 18: Pass at k for code generation
Pass@k for code generation is itself a noisy estimate over problems. This cell applies the generic bootstrap machinery to a synthetic per-item score.
Code cell 39
header("Demo 18 - Pass at k for code generation: bootstrap uncertainty")
values = np.clip(np.random.normal(0.68, 0.18, size=500), 0, 1)
mean, lo, hi = bootstrap_mean_ci(values, B=800)
print(f"mean={mean:.3f}, bootstrap 95% CI=[{lo:.3f}, {hi:.3f}]")
check_true(hi > lo, "bootstrap interval has positive width")
Demo 19: Log-probability and perplexity
Log-probability metrics live on a different scale from bounded accuracy-style scores. This cell plots one bounded metric across variants; the sketch that follows converts mean negative log-likelihood into perplexity.
Code cell 41
header("Demo 19 - Log-probability and perplexity: metric comparison plot")
names = np.array(["base", "prompt", "retrieval", "tool"])
scores = np.array([0.62, 0.68, 0.73, 0.70])
fig, ax = plt.subplots()
bars = ax.bar(names, scores, color=[COLORS["primary"], COLORS["secondary"], COLORS["tertiary"], COLORS["highlight"]])
ax.set_title("Evaluation metric across system variants")
ax.set_xlabel("System variant")
ax.set_ylabel("Score")
ax.set_ylim(0, 1)
for bar, val in zip(bars, scores):
    ax.text(bar.get_x() + bar.get_width()/2, val + 0.02, f"{val:.2f}", ha="center")
fig.tight_layout()
plt.show()
print("plotted metric comparison for four variants")
check_true(scores.max() > scores.min(), "variants differ in measured score")
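Perplexity is the exponential of the mean per-token negative log-likelihood, so unlike the bounded scores above it lives on [1, vocabulary size). A minimal sketch with synthetic per-token NLLs:

rng = np.random.default_rng(3)
nll = rng.gamma(shape=2.0, scale=0.9, size=5000)  # synthetic per-token NLL in nats
ppl = np.exp(nll.mean())
print(f"mean NLL={nll.mean():.3f} nats, perplexity={ppl:.2f}")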
Demo 20: Pairwise preference and judge agreement
Pairwise preference evaluation is only as reliable as its judges. This cell reuses the pass@k estimator; the sketch that follows measures judge agreement beyond chance with Cohen's kappa.
Code cell 43
header("Demo 20 - Pairwise preference and judge agreement: pass-at-k estimator")
n = 200
c = 43
def pass_at_k(n, c, k):
    # Unbiased estimate of P(at least one of k sampled completions is
    # correct), given c correct among n samples.
    if n - c < k:
        return 1.0
    return 1.0 - np.prod(1 - k / np.arange(n - c + 1, n + 1))

for k in [1, 5, 10, 50]:
    print(f"pass@{k}={pass_at_k(n, c, k):.3f}")
check_true(pass_at_k(n, c, 10) >= pass_at_k(n, c, 1), "pass@k increases with k")
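For pairwise preference data, raw agreement overstates reliability when one side dominates; Cohen's kappa corrects for chance agreement. A minimal sketch with two synthetic judges:

rng = np.random.default_rng(4)
j1 = (rng.random(500) < 0.6).astype(int)           # judge 1 preferences
j2 = np.where(rng.random(500) < 0.75, j1, 1 - j1)  # judge 2, ~75% raw agreement
p_obs = (j1 == j2).mean()
p_chance = j1.mean() * j2.mean() + (1 - j1.mean()) * (1 - j2.mean())
kappa = (p_obs - p_chance) / (1 - p_chance)
print(f"agreement={p_obs:.3f}, chance={p_chance:.3f}, kappa={kappa:.3f}")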
Demo 21: Bootstrap intervals
Bootstrap intervals apply to any per-slice or aggregate statistic. This cell reports analytic binomial intervals per slice, the baseline a bootstrap should roughly reproduce.
Code cell 45
header("Demo 21 - Bootstrap intervals: slice metrics")
slices = np.array(["short", "long", "code", "math", "multilingual"])
acc = np.array([0.82, 0.71, 0.66, 0.58, 0.62])
n = np.array([400, 260, 180, 160, 140])
se = np.sqrt(acc * (1 - acc) / n)
for name, a, e in zip(slices, acc, se):
    print(f"slice={name:12s} accuracy={a:.3f} +/- {1.96*e:.3f}")
check_true(acc.min() < acc.max(), "slices reveal heterogeneous performance")
Demo 22: Paired model comparisons
Comparing two models on the same items is a paired design, which removes shared item difficulty from the variance. This cell shows a 2x2 interaction; the sketch that follows bootstraps a paired score difference.
Code cell 47
header("Demo 22 - Paired model comparisons: factorial interaction")
A = np.array([0, 0, 1, 1])
B = np.array([0, 1, 0, 1])
y = 0.60 + 0.05 * A + 0.03 * B + 0.08 * A * B
interaction = y[3] - y[2] - y[1] + y[0]
for a, b, val in zip(A, B, y):
    print(f"A={a}, B={b}, score={val:.3f}")
print(f"interaction={interaction:.3f}")
check_close(float(interaction), 0.08, tol=1e-8, message="interaction recovered")
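A paired comparison scores both models on the same items and resamples items jointly, so shared item difficulty cancels out of the difference. A minimal sketch; the correlated synthetic scores are invented.

rng = np.random.default_rng(5)
difficulty = rng.normal(0, 0.15, 400)  # shared per-item effect
score_a = np.clip(0.70 - difficulty + rng.normal(0, 0.05, 400), 0, 1)
score_b = np.clip(0.67 - difficulty + rng.normal(0, 0.05, 400), 0, 1)
diff = score_a - score_b
boots = np.array([diff[rng.integers(0, 400, 400)].mean() for _ in range(2000)])
lo_d, hi_d = np.quantile(boots, [0.025, 0.975])
print(f"mean diff={diff.mean():.3f}, paired 95% CI=[{lo_d:.3f}, {hi_d:.3f}]")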
Demo 23: Leaderboard uncertainty
Leaderboard positions can flip inside the error bars. This cell runs a synthetic ablation; the rank-uncertainty sketch after Demo 10 applies here unchanged.
Code cell 49
header("Demo 23 - Leaderboard uncertainty: ablation effects")
base = 0.64
effects = {"retrieval": 0.09, "tool": 0.04, "rerank": 0.03, "cot": 0.02}
for name, eff in effects.items():
    print(f"remove {name:9s} -> expected score {base + sum(effects.values()) - eff:.3f}, effect={eff:.3f}")
total = base + sum(effects.values())
print(f"full system score={total:.3f}")
check_true(total > base, "full system improves over base in synthetic ablation")
Demo 24: Multiple comparisons
Testing many slices or variants at once inflates false positives. This cell computes ECE across bins; the sketch that follows shows a Bonferroni-style correction for a batch of comparisons.
Code cell 51
header("Demo 24 - Multiple comparisons: reliability diagram and ECE")
n = 1200
confidence = np.random.beta(4, 2, size=n)
true_prob = np.clip(confidence - 0.12 + np.random.normal(0, 0.03, size=n), 0.01, 0.99)
correct = (np.random.rand(n) < true_prob).astype(float)
bins = np.linspace(0, 1, 11)
ece = 0.0
centers, accs, confs = [], [], []
for lo, hi in zip(bins[:-1], bins[1:]):
    mask = (confidence >= lo) & (confidence < hi)
    if mask.any():
        acc = correct[mask].mean()
        conf = confidence[mask].mean()
        weight = mask.mean()  # fraction of all predictions landing in this bin
        ece += weight * abs(acc - conf)
        centers.append((lo + hi) / 2)
        accs.append(acc)
        confs.append(conf)
fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], color=COLORS["neutral"], linestyle="--", label="perfect calibration")
ax.plot(confs, accs, marker="o", color=COLORS["primary"], label="model")
ax.set_title("Reliability diagram")
ax.set_xlabel("Mean confidence")
ax.set_ylabel("Empirical accuracy")
ax.legend()
fig.tight_layout()
plt.show()
print(f"ECE={ece:.4f}")
check_true(ece >= 0, "ECE is nonnegative")
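When many comparisons run at once, a few will clear the 0.05 bar by luck alone. A minimal sketch of the Bonferroni adjustment on a batch of synthetic null p-values:

rng = np.random.default_rng(6)
p_values = np.sort(rng.random(20))  # 20 comparisons with no true effect
alpha = 0.05
raw_hits = (p_values < alpha).sum()
bonf_hits = (p_values < alpha / len(p_values)).sum()
print(f"naive rejections={raw_hits}, Bonferroni rejections={bonf_hits}")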
Demo 25: Benchmark power and sample size
Whether a benchmark can detect a given improvement is a power question. This cell computes the binomial accuracy interval; the sketch that follows estimates the sample size needed to resolve a two-point accuracy gap.
Code cell 53
header("Demo 25 - Benchmark power and sample size: finite-sample accuracy interval")
n = 600
y = (np.random.rand(n) < 0.74).astype(float)
mean = y.mean()
se = np.sqrt(mean * (1 - mean) / n)
lo, hi = mean - 1.96 * se, mean + 1.96 * se
print(f"accuracy={mean:.3f}, 95% CI=[{lo:.3f}, {hi:.3f}], n={n}")
check_true(lo <= mean <= hi, "point estimate lies inside its interval")
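A rough two-proportion power calculation answers how many items a benchmark needs to resolve a gap. A minimal sketch for detecting a two-point accuracy difference at 5% significance and 80% power; the accuracies and the hard-coded normal quantiles (1.96 and 0.84) are illustrative assumptions.

p1, p2 = 0.74, 0.76  # hypothetical accuracies to tell apart
z_alpha, z_beta = 1.96, 0.84
p_bar = (p1 + p2) / 2
n_req = ((z_alpha * np.sqrt(2 * p_bar * (1 - p_bar))
          + z_beta * np.sqrt(p1 * (1 - p1) + p2 * (1 - p2))) ** 2
         / (p1 - p2) ** 2)
print(f"items per model needed: ~{int(np.ceil(n_req))}")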