Theory Notebook — Math for LLMs

Error Analysis and Ablations

Evaluation and Reliability / Error Analysis and Ablations

Run notebook
Theory Notebook

Theory Notebook

Converted from theory.ipynb for web reading.

Error Analysis and Ablations

Error analysis turns aggregate scores into failure structure; ablations test which component actually caused an improvement.

This notebook is the executable companion to notes.md. It uses synthetic data so the evaluation mathematics can run anywhere without external files.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Prefer seaborn's colorblind-safe theme when the package is installed;
# otherwise fall back to matplotlib's bundled seaborn-like style sheet.
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

# Shared figure defaults for every plot in this notebook: readable fonts,
# hidden top/right spines, and tight bounding boxes for saved figures.
mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
# Fixed seed so every synthetic experiment below is reproducible.
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

import math  # NOTE(review): unused in the visible cells — kept for compatibility.

# Colorblind-safe hex palette (Paul Tol scheme) shared by all plots below.
COLORS = {
    "primary":   "#0077BB",
    "secondary": "#EE7733",
    "tertiary":  "#009988",
    "error":     "#CC3311",
    "neutral":   "#555555",
    "highlight": "#EE3377",
}

def header(title):
    """Print *title* framed by 80-character rules to separate demo sections."""
    rule = "=" * 80
    print("\n" + rule)
    print(title)
    print(rule)

def check_true(condition, message):
    """Print a PASS/FAIL line for *condition*, then assert that it holds."""
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {message}")
    assert ok

def check_close(actual, expected, tol=1e-8, message="values close"):
    """Print PASS/FAIL for |actual - expected| <= tol, then assert it."""
    ok = abs(actual - expected) <= tol
    status = "PASS" if ok else "FAIL"
    print(f"{status} - {message}: actual={actual:.6f}, expected={expected:.6f}")
    assert ok

def bootstrap_mean_ci(values, B=1000, alpha=0.05):
    """Percentile-bootstrap confidence interval for the mean of *values*.

    Draws B resamples (with replacement) using the global numpy RNG, so
    results are reproducible after ``np.random.seed``.

    Parameters
    ----------
    values : array-like of float
        Sample to resample. Must be non-empty.
    B : int
        Number of bootstrap resamples.
    alpha : float
        Miscoverage level; returns the (alpha/2, 1 - alpha/2) quantiles.

    Returns
    -------
    (mean, lo, hi) : tuple of float
        Sample mean and bootstrap CI endpoints.

    Raises
    ------
    ValueError
        If *values* is empty (previously surfaced as an opaque
        ``np.random.randint`` error).
    """
    values = np.asarray(values, dtype=float)
    if values.size == 0:
        raise ValueError("bootstrap_mean_ci requires at least one value")
    idx = np.random.randint(0, len(values), size=(B, len(values)))
    boot = values[idx].mean(axis=1)
    lo, hi = np.quantile(boot, [alpha / 2, 1 - alpha / 2])
    return float(values.mean()), float(lo), float(hi)

def sigmoid(x):
    """Numerically stable logistic function 1 / (1 + exp(-x)).

    The naive form ``1 / (1 + np.exp(-x))`` overflows (with a
    RuntimeWarning) for large negative x; here ``exp(-|x|)`` is always
    <= 1, so neither branch can overflow and the values are unchanged.
    """
    z = np.exp(-np.abs(x))
    return np.where(x >= 0, 1.0 / (1.0 + z), z / (1.0 + z))

print("Evaluation helper functions loaded.")

Demo 1: Aggregate metrics hide failure modes

This cell studies Aggregate metrics hide failure modes through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 5

header("Demo 1 - Aggregate metrics hide failure modes: confusion matrix")
y_true = np.random.randint(0, 3, size=600)
noise = np.random.rand(600) < 0.18
y_pred = y_true.copy()
y_pred[noise] = np.random.randint(0, 3, size=noise.sum())
cm = np.zeros((3, 3), dtype=int)
for t, p in zip(y_true, y_pred):
    cm[t, p] += 1
fig, ax = plt.subplots(figsize=(7, 6))
im = ax.imshow(cm, cmap="viridis")
fig.colorbar(im, ax=ax, label="count")
ax.set_title("Confusion matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
for i in range(3):
    for j in range(3):
        ax.text(j, i, str(cm[i, j]), ha="center", va="center", color="white")
fig.tight_layout()
plt.show()
print(cm)
check_true(cm.trace() <= cm.sum(), "diagonal cannot exceed total count")

Demo 2: Failures as structured data

This cell studies Failures as structured data through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 7

header("Demo 2 - Failures as structured data: slice metrics")
slices = np.array(["short", "long", "code", "math", "multilingual"])
acc = np.array([0.82, 0.71, 0.66, 0.58, 0.62])
n = np.array([400, 260, 180, 160, 140])
se = np.sqrt(acc * (1 - acc) / n)
for name, a, e in zip(slices, acc, se):
    print(f"slice={name:12s} accuracy={a:.3f} +/- {1.96*e:.3f}")
check_true(acc.min() < acc.max(), "slices reveal heterogeneous performance")

Demo 3: Ablations as causal probes

This cell studies Ablations as causal probes through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 9

header("Demo 3 - Ablations as causal probes: ablation effects")
base = 0.64
effects = {"retrieval": 0.09, "tool": 0.04, "rerank": 0.03, "cot": 0.02}
for name, eff in effects.items():
    print(f"remove {name:9s} -> expected score {base + sum(effects.values()) - eff:.3f}, effect={eff:.3f}")
total = base + sum(effects.values())
print(f"full system score={total:.3f}")
check_true(total > base, "full system improves over base in synthetic ablation")

Demo 4: Debugging without benchmark overfitting

This cell studies Debugging without benchmark overfitting through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 11

header("Demo 4 - Debugging without benchmark overfitting: factorial interaction")
A = np.array([0, 0, 1, 1])
B = np.array([0, 1, 0, 1])
y = 0.60 + 0.05 * A + 0.03 * B + 0.08 * A * B
interaction = y[3] - y[2] - y[1] + y[0]
for a, b, val in zip(A, B, y):
    print(f"A={a}, B={b}, score={val:.3f}")
print(f"interaction={interaction:.3f}")
check_close(float(interaction), 0.08, tol=1e-8, message="interaction recovered")

Demo 5: From one bug to a regression suite

This cell studies From one bug to a regression suite through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 13

header("Demo 5 - From one bug to a regression suite: bootstrap uncertainty")
values = np.clip(np.random.normal(0.68, 0.18, size=500), 0, 1)
mean, lo, hi = bootstrap_mean_ci(values, B=800)
print(f"mean={mean:.3f}, bootstrap 95% CI=[{lo:.3f}, {hi:.3f}]")
check_true(hi > lo, "bootstrap interval has positive width")

Demo 6: Error set

This cell studies Error set through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 15

header("Demo 6 - Error set: metric comparison plot")
names = np.array(["base", "prompt", "retrieval", "tool"])
scores = np.array([0.62, 0.68, 0.73, 0.70])
fig, ax = plt.subplots()
bars = ax.bar(names, scores, color=[COLORS["primary"], COLORS["secondary"], COLORS["tertiary"], COLORS["highlight"]])
ax.set_title("Evaluation metric across system variants")
ax.set_xlabel("System variant")
ax.set_ylabel("Score")
ax.set_ylim(0, 1)
for bar, val in zip(bars, scores):
    ax.text(bar.get_x() + bar.get_width()/2, val + 0.02, f"{val:.2f}", ha="center")
fig.tight_layout()
plt.show()
print("plotted metric comparison for four variants")
check_true(scores.max() > scores.min(), "variants differ in measured score")

Demo 7: Confusion matrix

This cell studies Confusion matrix through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 17

header("Demo 7 - Confusion matrix: finite-sample accuracy interval")
n = 600
y = (np.random.rand(n) < 0.74).astype(float)
mean = y.mean()
se = np.sqrt(mean * (1 - mean) / n)
lo, hi = mean - 1.96 * se, mean + 1.96 * se
print(f"accuracy={mean:.3f}, 95% CI=[{lo:.3f}, {hi:.3f}], n={n}")
check_true(lo <= mean <= hi, "point estimate lies inside its interval")

Demo 8: Slice and subgroup

This cell studies Slice and subgroup through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 19

header("Demo 8 - Slice and subgroup: worst-group risk")
groups = np.array(["A", "B", "C", "D"])
counts = np.array([500, 300, 120, 80])
risks = np.array([0.08, 0.11, 0.19, 0.27])
overall = np.average(risks, weights=counts)
worst = risks.max()
for g, n, r in zip(groups, counts, risks):
    print(f"group={g}, n={n}, risk={r:.3f}")
print(f"overall risk={overall:.3f}, worst-group risk={worst:.3f}")
check_true(worst >= overall, "worst-group risk upper-bounds weighted average risk")

Demo 9: Counterfactual example

This cell studies Counterfactual example through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 21

header("Demo 9 - Counterfactual example: confusion matrix")
y_true = np.random.randint(0, 3, size=600)
noise = np.random.rand(600) < 0.18
y_pred = y_true.copy()
y_pred[noise] = np.random.randint(0, 3, size=noise.sum())
cm = np.zeros((3, 3), dtype=int)
for t, p in zip(y_true, y_pred):
    cm[t, p] += 1
fig, ax = plt.subplots(figsize=(7, 6))
im = ax.imshow(cm, cmap="viridis")
fig.colorbar(im, ax=ax, label="count")
ax.set_title("Confusion matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
for i in range(3):
    for j in range(3):
        ax.text(j, i, str(cm[i, j]), ha="center", va="center", color="white")
fig.tight_layout()
plt.show()
print(cm)
check_true(cm.trace() <= cm.sum(), "diagonal cannot exceed total count")

Demo 10: Ablation effect and interaction

This cell studies Ablation effect and interaction through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 23

header("Demo 10 - Ablation effect and interaction: slice metrics")
slices = np.array(["short", "long", "code", "math", "multilingual"])
acc = np.array([0.82, 0.71, 0.66, 0.58, 0.62])
n = np.array([400, 260, 180, 160, 140])
se = np.sqrt(acc * (1 - acc) / n)
for name, a, e in zip(slices, acc, se):
    print(f"slice={name:12s} accuracy={a:.3f} +/- {1.96*e:.3f}")
check_true(acc.min() < acc.max(), "slices reveal heterogeneous performance")

Demo 11: False positives and false negatives

This cell studies False positives and false negatives through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 25

header("Demo 11 - False positives and false negatives: ablation effects")
base = 0.64
effects = {"retrieval": 0.09, "tool": 0.04, "rerank": 0.03, "cot": 0.02}
for name, eff in effects.items():
    print(f"remove {name:9s} -> expected score {base + sum(effects.values()) - eff:.3f}, effect={eff:.3f}")
total = base + sum(effects.values())
print(f"full system score={total:.3f}")
check_true(total > base, "full system improves over base in synthetic ablation")

Demo 12: Hallucination and unsupported claims

This cell studies Hallucination and unsupported claims through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 27

header("Demo 12 - Hallucination and unsupported claims: factorial interaction")
A = np.array([0, 0, 1, 1])
B = np.array([0, 1, 0, 1])
y = 0.60 + 0.05 * A + 0.03 * B + 0.08 * A * B
interaction = y[3] - y[2] - y[1] + y[0]
for a, b, val in zip(A, B, y):
    print(f"A={a}, B={b}, score={val:.3f}")
print(f"interaction={interaction:.3f}")
check_close(float(interaction), 0.08, tol=1e-8, message="interaction recovered")

Demo 13: Format and instruction errors

This cell studies Format and instruction errors through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 29

header("Demo 13 - Format and instruction errors: bootstrap uncertainty")
values = np.clip(np.random.normal(0.68, 0.18, size=500), 0, 1)
mean, lo, hi = bootstrap_mean_ci(values, B=800)
print(f"mean={mean:.3f}, bootstrap 95% CI=[{lo:.3f}, {hi:.3f}]")
check_true(hi > lo, "bootstrap interval has positive width")

Demo 14: Reasoning errors

This cell studies Reasoning errors through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 31

header("Demo 14 - Reasoning errors: metric comparison plot")
names = np.array(["base", "prompt", "retrieval", "tool"])
scores = np.array([0.62, 0.68, 0.73, 0.70])
fig, ax = plt.subplots()
bars = ax.bar(names, scores, color=[COLORS["primary"], COLORS["secondary"], COLORS["tertiary"], COLORS["highlight"]])
ax.set_title("Evaluation metric across system variants")
ax.set_xlabel("System variant")
ax.set_ylabel("Score")
ax.set_ylim(0, 1)
for bar, val in zip(bars, scores):
    ax.text(bar.get_x() + bar.get_width()/2, val + 0.02, f"{val:.2f}", ha="center")
fig.tight_layout()
plt.show()
print("plotted metric comparison for four variants")
check_true(scores.max() > scores.min(), "variants differ in measured score")

Demo 15: Tool and retrieval failures

This cell studies Tool and retrieval failures through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 33

header("Demo 15 - Tool and retrieval failures: finite-sample accuracy interval")
n = 600
y = (np.random.rand(n) < 0.74).astype(float)
mean = y.mean()
se = np.sqrt(mean * (1 - mean) / n)
lo, hi = mean - 1.96 * se, mean + 1.96 * se
print(f"accuracy={mean:.3f}, 95% CI=[{lo:.3f}, {hi:.3f}], n={n}")
check_true(lo <= mean <= hi, "point estimate lies inside its interval")

Demo 16: Stratified metrics

This cell studies Stratified metrics through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 35

header("Demo 16 - Stratified metrics: worst-group risk")
groups = np.array(["A", "B", "C", "D"])
counts = np.array([500, 300, 120, 80])
risks = np.array([0.08, 0.11, 0.19, 0.27])
overall = np.average(risks, weights=counts)
worst = risks.max()
for g, n, r in zip(groups, counts, risks):
    print(f"group={g}, n={n}, risk={r:.3f}")
print(f"overall risk={overall:.3f}, worst-group risk={worst:.3f}")
check_true(worst >= overall, "worst-group risk upper-bounds weighted average risk")

Demo 17: Subgroup confidence intervals

This cell studies Subgroup confidence intervals through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 37

header("Demo 17 - Subgroup confidence intervals: confusion matrix")
y_true = np.random.randint(0, 3, size=600)
noise = np.random.rand(600) < 0.18
y_pred = y_true.copy()
y_pred[noise] = np.random.randint(0, 3, size=noise.sum())
cm = np.zeros((3, 3), dtype=int)
for t, p in zip(y_true, y_pred):
    cm[t, p] += 1
fig, ax = plt.subplots(figsize=(7, 6))
im = ax.imshow(cm, cmap="viridis")
fig.colorbar(im, ax=ax, label="count")
ax.set_title("Confusion matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
for i in range(3):
    for j in range(3):
        ax.text(j, i, str(cm[i, j]), ha="center", va="center", color="white")
fig.tight_layout()
plt.show()
print(cm)
check_true(cm.trace() <= cm.sum(), "diagonal cannot exceed total count")

Demo 18: Multiple-testing control for slices

This cell studies Multiple-testing control for slices through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 39

header("Demo 18 - Multiple-testing control for slices: slice metrics")
slices = np.array(["short", "long", "code", "math", "multilingual"])
acc = np.array([0.82, 0.71, 0.66, 0.58, 0.62])
n = np.array([400, 260, 180, 160, 140])
se = np.sqrt(acc * (1 - acc) / n)
for name, a, e in zip(slices, acc, se):
    print(f"slice={name:12s} accuracy={a:.3f} +/- {1.96*e:.3f}")
check_true(acc.min() < acc.max(), "slices reveal heterogeneous performance")

Demo 19: Prioritizing failures

This cell studies Prioritizing failures through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 41

header("Demo 19 - Prioritizing failures: ablation effects")
base = 0.64
effects = {"retrieval": 0.09, "tool": 0.04, "rerank": 0.03, "cot": 0.02}
for name, eff in effects.items():
    print(f"remove {name:9s} -> expected score {base + sum(effects.values()) - eff:.3f}, effect={eff:.3f}")
total = base + sum(effects.values())
print(f"full system score={total:.3f}")
check_true(total > base, "full system improves over base in synthetic ablation")

Demo 20: Dashboard and report design

This cell studies Dashboard and report design through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 43

header("Demo 20 - Dashboard and report design: factorial interaction")
A = np.array([0, 0, 1, 1])
B = np.array([0, 1, 0, 1])
y = 0.60 + 0.05 * A + 0.03 * B + 0.08 * A * B
interaction = y[3] - y[2] - y[1] + y[0]
for a, b, val in zip(A, B, y):
    print(f"A={a}, B={b}, score={val:.3f}")
print(f"interaction={interaction:.3f}")
check_close(float(interaction), 0.08, tol=1e-8, message="interaction recovered")

Demo 21: Model ablations

This cell studies Model ablations through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 45

header("Demo 21 - Model ablations: bootstrap uncertainty")
values = np.clip(np.random.normal(0.68, 0.18, size=500), 0, 1)
mean, lo, hi = bootstrap_mean_ci(values, B=800)
print(f"mean={mean:.3f}, bootstrap 95% CI=[{lo:.3f}, {hi:.3f}]")
check_true(hi > lo, "bootstrap interval has positive width")

Demo 22: Data ablations

This cell studies Data ablations through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 47

header("Demo 22 - Data ablations: metric comparison plot")
names = np.array(["base", "prompt", "retrieval", "tool"])
scores = np.array([0.62, 0.68, 0.73, 0.70])
fig, ax = plt.subplots()
bars = ax.bar(names, scores, color=[COLORS["primary"], COLORS["secondary"], COLORS["tertiary"], COLORS["highlight"]])
ax.set_title("Evaluation metric across system variants")
ax.set_xlabel("System variant")
ax.set_ylabel("Score")
ax.set_ylim(0, 1)
for bar, val in zip(bars, scores):
    ax.text(bar.get_x() + bar.get_width()/2, val + 0.02, f"{val:.2f}", ha="center")
fig.tight_layout()
plt.show()
print("plotted metric comparison for four variants")
check_true(scores.max() > scores.min(), "variants differ in measured score")

Demo 23: Prompt and decoding ablations

This cell studies Prompt and decoding ablations through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 49

header("Demo 23 - Prompt and decoding ablations: finite-sample accuracy interval")
n = 600
y = (np.random.rand(n) < 0.74).astype(float)
mean = y.mean()
se = np.sqrt(mean * (1 - mean) / n)
lo, hi = mean - 1.96 * se, mean + 1.96 * se
print(f"accuracy={mean:.3f}, 95% CI=[{lo:.3f}, {hi:.3f}], n={n}")
check_true(lo <= mean <= hi, "point estimate lies inside its interval")

Demo 24: Retrieval and tool ablations

This cell studies Retrieval and tool ablations through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 51

header("Demo 24 - Retrieval and tool ablations: worst-group risk")
groups = np.array(["A", "B", "C", "D"])
counts = np.array([500, 300, 120, 80])
risks = np.array([0.08, 0.11, 0.19, 0.27])
overall = np.average(risks, weights=counts)
worst = risks.max()
for g, n, r in zip(groups, counts, risks):
    print(f"group={g}, n={n}, risk={r:.3f}")
print(f"overall risk={overall:.3f}, worst-group risk={worst:.3f}")
check_true(worst >= overall, "worst-group risk upper-bounds weighted average risk")

Demo 25: Metric ablations

This cell studies Metric ablations through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 53

header("Demo 25 - Metric ablations: confusion matrix")
y_true = np.random.randint(0, 3, size=600)
noise = np.random.rand(600) < 0.18
y_pred = y_true.copy()
y_pred[noise] = np.random.randint(0, 3, size=noise.sum())
cm = np.zeros((3, 3), dtype=int)
for t, p in zip(y_true, y_pred):
    cm[t, p] += 1
fig, ax = plt.subplots(figsize=(7, 6))
im = ax.imshow(cm, cmap="viridis")
fig.colorbar(im, ax=ax, label="count")
ax.set_title("Confusion matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
for i in range(3):
    for j in range(3):
        ax.text(j, i, str(cm[i, j]), ha="center", va="center", color="white")
fig.tight_layout()
plt.show()
print(cm)
check_true(cm.trace() <= cm.sum(), "diagonal cannot exceed total count")