Theory Notebook
Converted from theory.ipynb for web reading.
Robustness and Distribution Shift
Robustness evaluation measures how model risk changes when the test distribution, prompt surface, subgroup, or adversary changes.
This notebook is the executable companion to notes.md. It uses synthetic data so the evaluation mathematics can run anywhere without external files.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False
mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import math
COLORS = {
    "primary": "#0077BB",
    "secondary": "#EE7733",
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",
    "highlight": "#EE3377",
}

def header(title):
    print("\n" + "=" * 80)
    print(title)
    print("=" * 80)

def check_true(condition, message):
    print(f"{'PASS' if bool(condition) else 'FAIL'} - {message}")
    assert bool(condition)

def check_close(actual, expected, tol=1e-8, message="values close"):
    ok = abs(actual - expected) <= tol
    print(f"{'PASS' if ok else 'FAIL'} - {message}: actual={actual:.6f}, expected={expected:.6f}")
    assert ok

def bootstrap_mean_ci(values, B=1000, alpha=0.05):
    values = np.asarray(values, dtype=float)
    idx = np.random.randint(0, len(values), size=(B, len(values)))
    boot = values[idx].mean(axis=1)
    lo, hi = np.quantile(boot, [alpha / 2, 1 - alpha / 2])
    return float(values.mean()), float(lo), float(hi)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))
print("Evaluation helper functions loaded.")
Demo 1: Deployment changes the data distribution
This cell studies how deployment changes the data distribution through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 5
header("Demo 1 - Deployment changes the data distribution: distribution shift")
train = np.random.normal(0.0, 1.0, size=(800, 2))
test = np.random.normal([0.7, -0.4], [1.2, 0.8], size=(800, 2))
train_mean = train.mean(axis=0)
test_mean = test.mean(axis=0)
shift_norm = np.linalg.norm(test_mean - train_mean)
fig, ax = plt.subplots()
ax.scatter(train[:200, 0], train[:200, 1], s=18, alpha=0.5, color=COLORS["primary"], label="train")
ax.scatter(test[:200, 0], test[:200, 1], s=18, alpha=0.5, color=COLORS["secondary"], label="test")
ax.set_title("Synthetic covariate shift")
ax.set_xlabel("feature 1")
ax.set_ylabel("feature 2")
ax.legend()
fig.tight_layout()
plt.show()
print(f"mean-shift norm={shift_norm:.3f}")
check_true(shift_norm > 0.2, "synthetic shift is detectable")
Demo 2: Prompt surface as an input distribution
This cell studies the prompt surface as an input distribution through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 7
header("Demo 2 - Prompt surface as an input distribution: worst-group risk")
groups = np.array(["A", "B", "C", "D"])
counts = np.array([500, 300, 120, 80])
risks = np.array([0.08, 0.11, 0.19, 0.27])
overall = np.average(risks, weights=counts)
worst = risks.max()
for g, n, r in zip(groups, counts, risks):
    print(f"group={g}, n={n}, risk={r:.3f}")
print(f"overall risk={overall:.3f}, worst-group risk={worst:.3f}")
check_true(worst >= overall, "worst-group risk upper-bounds weighted average risk")
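The group risks above are printed as point values. A minimal follow-up sketch (assuming each risk came from n independent Bernoulli outcomes) attaches a normal-approximation interval to each group; the smallest group has the widest interval, which matters because it is also the worst one.
import numpy as np

counts = np.array([500, 300, 120, 80])
risks = np.array([0.08, 0.11, 0.19, 0.27])
se = np.sqrt(risks * (1 - risks) / counts)   # binomial standard error per group
for g, n, r, e in zip("ABCD", counts, risks, se):
    print(f"group={g}, n={n}, risk={r:.3f} +/- {1.96 * e:.3f}")
# The worst group (D) has the fewest samples, so its risk estimate is the least certain.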
Demo 3: Rare tails dominate reliability risk
This cell studies how rare tails dominate reliability risk through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 9
header("Demo 3 - Rare tails dominate reliability risk: tail risk via CVaR")
losses = np.random.lognormal(mean=-2.0, sigma=0.8, size=1200)
alpha = 0.9
threshold = np.quantile(losses, alpha)
cvar = losses[losses >= threshold].mean()
print(f"mean loss={losses.mean():.4f}, 90% tail threshold={threshold:.4f}, CVaR={cvar:.4f}")
check_true(cvar >= losses.mean(), "tail risk exceeds average risk")
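The CVaR above is a point estimate. One hedged way to attach uncertainty is a percentile bootstrap over the CVaR statistic itself, mirroring the bootstrap_mean_ci helper; the resample count of 800 is an illustrative choice.
import numpy as np

rng = np.random.default_rng(0)
losses = rng.lognormal(mean=-2.0, sigma=0.8, size=1200)

def cvar(x, alpha=0.9):
    # mean of losses at or above the alpha-quantile
    t = np.quantile(x, alpha)
    return float(x[x >= t].mean())

boot = np.array([cvar(rng.choice(losses, size=len(losses), replace=True)) for _ in range(800)])
lo, hi = np.quantile(boot, [0.025, 0.975])
print(f"CVaR={cvar(losses):.4f}, bootstrap 95% CI=[{lo:.4f}, {hi:.4f}]")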
Demo 4: Robustness is not only adversarial accuracy
This cell studies why robustness is not only adversarial accuracy through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 11
header("Demo 4 - Robustness is not only adversarial accuracy: slice metrics")
slices = np.array(["short", "long", "code", "math", "multilingual"])
acc = np.array([0.82, 0.71, 0.66, 0.58, 0.62])
n = np.array([400, 260, 180, 160, 140])
se = np.sqrt(acc * (1 - acc) / n)
for name, a, e in zip(slices, acc, se):
    print(f"slice={name:12s} accuracy={a:.3f} +/- {1.96*e:.3f}")
check_true(acc.min() < acc.max(), "slices reveal heterogeneous performance")
Demo 5: Reliability budgets across shifts
This cell studies reliability budgets across shifts through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 13
header("Demo 5 - Reliability budgets across shifts: finite-sample accuracy interval")
n = 600
y = (np.random.rand(n) < 0.74).astype(float)
mean = y.mean()
se = np.sqrt(mean * (1 - mean) / n)
lo, hi = mean - 1.96 * se, mean + 1.96 * se
print(f"accuracy={mean:.3f}, 95% CI=[{lo:.3f}, {hi:.3f}], n={n}")
check_true(lo <= mean <= hi, "point estimate lies inside its interval")
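The normal-approximation interval above degrades near accuracy 0 or 1 and at small n. A minimal Wilson score interval is a common drop-in alternative; the 444-of-600 count below is chosen to roughly match the demo's expected rate of 0.74.
import numpy as np

def wilson_interval(k, n, z=1.96):
    # Wilson score interval for a binomial proportion k/n
    p = k / n
    denom = 1 + z**2 / n
    center = (p + z**2 / (2 * n)) / denom
    half = z * np.sqrt(p * (1 - p) / n + z**2 / (4 * n**2)) / denom
    return center - half, center + half

lo, hi = wilson_interval(444, 600)
print(f"Wilson 95% CI=[{lo:.3f}, {hi:.3f}]")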
Demo 6: Training and test distributions
This cell studies training and test distributions through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 15
header("Demo 6 - Training and test distributions: bootstrap uncertainty")
values = np.clip(np.random.normal(0.68, 0.18, size=500), 0, 1)
mean, lo, hi = bootstrap_mean_ci(values, B=800)
print(f"mean={mean:.3f}, bootstrap 95% CI=[{lo:.3f}, {hi:.3f}]")
check_true(hi > lo, "bootstrap interval has positive width")
Demo 7: Covariate shift, label shift, and concept shift
This cell studies covariate shift, label shift, and concept shift through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 17
header("Demo 7 - Covariate shift, label shift, and concept shift: metric comparison plot")
names = np.array(["base", "prompt", "retrieval", "tool"])
scores = np.array([0.62, 0.68, 0.73, 0.70])
fig, ax = plt.subplots()
bars = ax.bar(names, scores, color=[COLORS["primary"], COLORS["secondary"], COLORS["tertiary"], COLORS["highlight"]])
ax.set_title("Evaluation metric across system variants")
ax.set_xlabel("System variant")
ax.set_ylabel("Score")
ax.set_ylim(0, 1)
for bar, val in zip(bars, scores):
    ax.text(bar.get_x() + bar.get_width()/2, val + 0.02, f"{val:.2f}", ha="center")
fig.tight_layout()
plt.show()
print("plotted metric comparison for four variants")
check_true(scores.max() > scores.min(), "variants differ in measured score")
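The bar chart compares variants, but the heading also names the three classic shift types. Under pure covariate shift, target risk can be estimated from labeled source data by importance weighting. A minimal sketch, assuming both densities are known one-dimensional Gaussians (rarely true in practice, where the density ratio must be estimated):
import numpy as np

rng = np.random.default_rng(0)

def gauss_pdf(x, mu, sigma):
    return np.exp(-0.5 * ((x - mu) / sigma) ** 2) / (sigma * np.sqrt(2 * np.pi))

# Source N(0,1), target N(0.7,1); the per-example loss is a fixed function of x.
x_src = rng.normal(0.0, 1.0, size=5000)
loss_src = (x_src > 1.0).astype(float)
w = gauss_pdf(x_src, 0.7, 1.0) / gauss_pdf(x_src, 0.0, 1.0)  # density ratio p_test / p_train
est = float(np.average(loss_src, weights=w))

x_tgt = rng.normal(0.7, 1.0, size=5000)
direct = float(np.mean((x_tgt > 1.0).astype(float)))
print(f"importance-weighted estimate={est:.3f}, direct target estimate={direct:.3f}")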
Demo 8: Subgroup risk
This cell studies subgroup risk through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 19
header("Demo 8 - Subgroup risk: factorial interaction")
A = np.array([0, 0, 1, 1])
B = np.array([0, 1, 0, 1])
y = 0.60 + 0.05 * A + 0.03 * B + 0.08 * A * B
interaction = y[3] - y[2] - y[1] + y[0]
for a, b, val in zip(A, B, y):
    print(f"A={a}, B={b}, score={val:.3f}")
print(f"interaction={interaction:.3f}")
check_close(float(interaction), 0.08, tol=1e-8, message="interaction recovered")
Demo 9: Robust risk and worst-case risk
This cell studies robust risk and worst-case risk through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 21
header("Demo 9 - Robust risk and worst-case risk: distribution shift")
train = np.random.normal(0.0, 1.0, size=(800, 2))
test = np.random.normal([0.7, -0.4], [1.2, 0.8], size=(800, 2))
train_mean = train.mean(axis=0)
test_mean = test.mean(axis=0)
shift_norm = np.linalg.norm(test_mean - train_mean)
fig, ax = plt.subplots()
ax.scatter(train[:200, 0], train[:200, 1], s=18, alpha=0.5, color=COLORS["primary"], label="train")
ax.scatter(test[:200, 0], test[:200, 1], s=18, alpha=0.5, color=COLORS["secondary"], label="test")
ax.set_title("Synthetic covariate shift")
ax.set_xlabel("feature 1")
ax.set_ylabel("feature 2")
ax.legend()
fig.tight_layout()
plt.show()
print(f"mean-shift norm={shift_norm:.3f}")
check_true(shift_norm > 0.2, "synthetic shift is detectable")
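To connect the scatter back to the heading: robust risk is a maximum of risk over a family of test distributions. A minimal sketch with a fixed concept, a fixed predictor, and an explicit four-member family of test means (all illustrative choices):
import numpy as np

rng = np.random.default_rng(1)

def rule_risk(shift, n=20000):
    # fixed concept y = 1{x > 0.5}; fixed predictor 1{x > 0}; only the input law moves
    x = rng.normal(shift, 1.0, size=n)
    y = (x > 0.5).astype(int)
    pred = (x > 0).astype(int)
    return float(np.mean(pred != y))

family = [-1.0, 0.0, 0.25, 1.0]   # explicit family of test means
risks = {s: rule_risk(s) for s in family}
for s, r in risks.items():
    print(f"test mean={s:+.2f}: risk={r:.3f}")
print(f"worst-case (robust) risk over the family={max(risks.values()):.3f}")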
Demo 10: Threat model and perturbation set
This cell studies the threat model and perturbation set through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 23
header("Demo 10 - Threat model and perturbation set: worst-group risk")
groups = np.array(["A", "B", "C", "D"])
counts = np.array([500, 300, 120, 80])
risks = np.array([0.08, 0.11, 0.19, 0.27])
overall = np.average(risks, weights=counts)
worst = risks.max()
for g, n, r in zip(groups, counts, risks):
    print(f"group={g}, n={n}, risk={r:.3f}")
print(f"overall risk={overall:.3f}, worst-group risk={worst:.3f}")
check_true(worst >= overall, "worst-group risk upper-bounds weighted average risk")
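A threat model pins down the perturbation set before any robustness number is reported. A minimal sketch, assuming an L-infinity ball of radius eps around each input and a random-search attacker; real evaluations use much stronger gradient-based or adaptive attacks, so this is a lower bound on attack strength.
import numpy as np

rng = np.random.default_rng(0)
w = np.array([1.0, -1.0])          # fixed linear scorer (illustrative)
X = rng.normal(0, 1, size=(500, 2))
y = (X @ w > 0).astype(int)        # clean labels taken from the scorer itself

def worst_of_k(X, y, eps=0.3, k=20):
    # random search inside the L-inf ball: try k perturbations, count any that flips the prediction
    errors = 0
    for x, t in zip(X, y):
        deltas = rng.uniform(-eps, eps, size=(k, 2))
        preds = ((x + deltas) @ w > 0).astype(int)
        errors += int((preds != t).any())
    return errors / len(X)

clean_err = float(np.mean((X @ w > 0).astype(int) != y))  # zero by construction
print(f"clean error={clean_err:.3f}, worst-of-20 error at eps=0.3: {worst_of_k(X, y):.3f}")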
Demo 11: Two-sample tests
This cell studies two-sample tests through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 25
header("Demo 11 - Two-sample tests: tail risk via CVaR")
losses = np.random.lognormal(mean=-2.0, sigma=0.8, size=1200)
alpha = 0.9
threshold = np.quantile(losses, alpha)
cvar = losses[losses >= threshold].mean()
print(f"mean loss={losses.mean():.4f}, 90% tail threshold={threshold:.4f}, CVaR={cvar:.4f}")
check_true(cvar >= losses.mean(), "tail risk exceeds average risk")
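The heading names two-sample tests, so a minimal permutation-test sketch follows, using the absolute mean difference as the statistic on synthetic samples. Other statistics, such as MMD, slot into the same permutation scheme.
import numpy as np

rng = np.random.default_rng(0)
a = rng.normal(0.00, 1.0, size=300)
b = rng.normal(0.25, 1.0, size=300)

obs = abs(a.mean() - b.mean())
pooled = np.concatenate([a, b])
count = 0
for _ in range(2000):
    perm = rng.permutation(pooled)
    stat = abs(perm[:len(a)].mean() - perm[len(a):].mean())
    count += stat >= obs
p_value = (count + 1) / (2000 + 1)   # add-one correction keeps the p-value valid
print(f"observed |mean diff|={obs:.3f}, permutation p-value={p_value:.4f}")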
Demo 12: MMD and Wasserstein previews
This cell studies MMD and Wasserstein previews through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 27
header("Demo 12 - MMD and Wasserstein previews: slice metrics")
slices = np.array(["short", "long", "code", "math", "multilingual"])
acc = np.array([0.82, 0.71, 0.66, 0.58, 0.62])
n = np.array([400, 260, 180, 160, 140])
se = np.sqrt(acc * (1 - acc) / n)
for name, a, e in zip(slices, acc, se):
    print(f"slice={name:12s} accuracy={a:.3f} +/- {1.96*e:.3f}")
check_true(acc.min() < acc.max(), "slices reveal heterogeneous performance")
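To actually preview the heading's two distances: a minimal sketch of a biased RBF-kernel MMD^2 estimate (fixed bandwidth gamma, an illustrative choice) and the one-dimensional Wasserstein-1 distance, which for equal sample sizes reduces to the mean absolute difference of sorted samples.
import numpy as np

rng = np.random.default_rng(0)
a = rng.normal(0.0, 1.0, size=(200, 1))
b = rng.normal(0.5, 1.0, size=(200, 1))

def mmd2_rbf(x, y, gamma=0.5):
    # biased MMD^2 estimate with RBF kernel k(u,v) = exp(-gamma * ||u - v||^2)
    def k(u, v):
        d = ((u[:, None, :] - v[None, :, :]) ** 2).sum(-1)
        return np.exp(-gamma * d)
    return float(k(x, x).mean() + k(y, y).mean() - 2 * k(x, y).mean())

def wasserstein1_1d(x, y):
    # for equal sample sizes, W1 is the mean absolute difference of sorted samples
    return float(np.mean(np.abs(np.sort(x.ravel()) - np.sort(y.ravel()))))

print(f"MMD^2 (rbf)={mmd2_rbf(a, b):.4f}")
print(f"W1 (1-D)={wasserstein1_1d(a, b):.4f}")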
Demo 13: Embedding drift
This cell studies embedding drift through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 29
header("Demo 13 - Embedding drift: finite-sample accuracy interval")
n = 600
y = (np.random.rand(n) < 0.74).astype(float)
mean = y.mean()
se = np.sqrt(mean * (1 - mean) / n)
lo, hi = mean - 1.96 * se, mean + 1.96 * se
print(f"accuracy={mean:.3f}, 95% CI=[{lo:.3f}, {hi:.3f}], n={n}")
check_true(lo <= mean <= hi, "point estimate lies inside its interval")
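For embedding drift specifically, a minimal sketch on synthetic embedding batches (real embeddings would come from a model): track the distance between window mean embeddings and compare it against a same-window baseline that reflects sampling noise alone.
import numpy as np

rng = np.random.default_rng(0)
ref = rng.normal(0.00, 1.0, size=(2000, 32))   # reference-window "embeddings"
cur = rng.normal(0.15, 1.0, size=(2000, 32))   # current-window "embeddings"

drift = float(np.linalg.norm(cur.mean(axis=0) - ref.mean(axis=0)))
# same-window baseline: split the reference window in half to see drift under no change
baseline = float(np.linalg.norm(ref[:1000].mean(axis=0) - ref[1000:].mean(axis=0)))
print(f"mean-embedding drift={drift:.3f} vs same-window baseline={baseline:.3f}")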
Demo 14: Slice drift
This cell studies slice drift through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 31
header("Demo 14 - Slice drift: bootstrap uncertainty")
values = np.clip(np.random.normal(0.68, 0.18, size=500), 0, 1)
mean, lo, hi = bootstrap_mean_ci(values, B=800)
print(f"mean={mean:.3f}, bootstrap 95% CI=[{lo:.3f}, {hi:.3f}]")
check_true(hi > lo, "bootstrap interval has positive width")
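Slice drift can hurt the aggregate even when every per-slice accuracy is unchanged, because the mix over slices moves. A minimal sketch reusing the slice accuracies from Demo 4, with a hypothetical drifted mix:
import numpy as np

acc = np.array([0.82, 0.71, 0.66, 0.58, 0.62])           # per-slice accuracy, from Demo 4
p_ref = np.array([400, 260, 180, 160, 140], dtype=float)
p_ref /= p_ref.sum()                                      # reference slice mix
p_cur = np.array([0.20, 0.20, 0.25, 0.20, 0.15])          # hypothetical drifted mix
tv = 0.5 * float(np.abs(p_ref - p_cur).sum())             # total variation between mixes
print(f"slice-mix total variation distance={tv:.3f}")
print(f"overall accuracy: reference mix={p_ref @ acc:.3f}, drifted mix={p_cur @ acc:.3f}")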
Demo 15: OOD score functions
This cell studies OOD score functions through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 33
header("Demo 15 - OOD score functions: metric comparison plot")
names = np.array(["base", "prompt", "retrieval", "tool"])
scores = np.array([0.62, 0.68, 0.73, 0.70])
fig, ax = plt.subplots()
bars = ax.bar(names, scores, color=[COLORS["primary"], COLORS["secondary"], COLORS["tertiary"], COLORS["highlight"]])
ax.set_title("Evaluation metric across system variants")
ax.set_xlabel("System variant")
ax.set_ylabel("Score")
ax.set_ylim(0, 1)
for bar, val in zip(bars, scores):
    ax.text(bar.get_x() + bar.get_width()/2, val + 0.02, f"{val:.2f}", ha="center")
fig.tight_layout()
plt.show()
print("plotted metric comparison for four variants")
check_true(scores.max() > scores.min(), "variants differ in measured score")
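The heading names OOD score functions. A minimal sketch of the maximum-softmax-probability score on synthetic logits (real logits would come from a classifier; the +4 confidence boost is an illustrative construction), with AUROC computed as a rank-comparison probability:
import numpy as np

rng = np.random.default_rng(0)
# Hypothetical logits: in-distribution rows get a +4 boost on one class (more confident)
logits_in = rng.normal(0, 1, size=(500, 10)) + 4.0 * np.eye(10)[rng.integers(0, 10, size=500)]
logits_out = rng.normal(0, 1, size=(500, 10))

def msp(logits):
    # maximum softmax probability; higher means "looks in-distribution"
    z = logits - logits.max(axis=1, keepdims=True)
    p = np.exp(z) / np.exp(z).sum(axis=1, keepdims=True)
    return p.max(axis=1)

s_in, s_out = msp(logits_in), msp(logits_out)
auroc = float((s_in[:, None] > s_out[None, :]).mean())   # P(score_in > score_out)
print(f"mean MSP: in={s_in.mean():.3f}, out={s_out.mean():.3f}, AUROC={auroc:.3f}")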
Demo 16: Perturbation tests
This cell studies perturbation tests through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 35
header("Demo 16 - Perturbation tests: factorial interaction")
A = np.array([0, 0, 1, 1])
B = np.array([0, 1, 0, 1])
y = 0.60 + 0.05 * A + 0.03 * B + 0.08 * A * B
interaction = y[3] - y[2] - y[1] + y[0]
for a, b, val in zip(A, B, y):
    print(f"A={a}, B={b}, score={val:.3f}")
print(f"interaction={interaction:.3f}")
check_close(float(interaction), 0.08, tol=1e-8, message="interaction recovered")
Demo 17: Stress tests
This cell studies stress tests through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 37
header("Demo 17 - Stress tests: distribution shift")
train = np.random.normal(0.0, 1.0, size=(800, 2))
test = np.random.normal([0.7, -0.4], [1.2, 0.8], size=(800, 2))
train_mean = train.mean(axis=0)
test_mean = test.mean(axis=0)
shift_norm = np.linalg.norm(test_mean - train_mean)
fig, ax = plt.subplots()
ax.scatter(train[:200, 0], train[:200, 1], s=18, alpha=0.5, color=COLORS["primary"], label="train")
ax.scatter(test[:200, 0], test[:200, 1], s=18, alpha=0.5, color=COLORS["secondary"], label="test")
ax.set_title("Synthetic covariate shift")
ax.set_xlabel("feature 1")
ax.set_ylabel("feature 2")
ax.legend()
fig.tight_layout()
plt.show()
print(f"mean-shift norm={shift_norm:.3f}")
check_true(shift_norm > 0.2, "synthetic shift is detectable")
Demo 18: Adversarial examples
This cell studies adversarial examples through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 39
header("Demo 18 - Adversarial examples: worst-group risk")
groups = np.array(["A", "B", "C", "D"])
counts = np.array([500, 300, 120, 80])
risks = np.array([0.08, 0.11, 0.19, 0.27])
overall = np.average(risks, weights=counts)
worst = risks.max()
for g, n, r in zip(groups, counts, risks):
    print(f"group={g}, n={n}, risk={r:.3f}")
print(f"overall risk={overall:.3f}, worst-group risk={worst:.3f}")
check_true(worst >= overall, "worst-group risk upper-bounds weighted average risk")
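For adversarial examples proper, a minimal FGSM-style sketch on a hand-set logistic model: the gradient of the logistic loss with respect to the input has a closed form, so no autodiff is needed. The weights and eps values are illustrative.
import numpy as np

rng = np.random.default_rng(0)
w, b = np.array([2.0, -1.0]), 0.0    # hand-set logistic model (illustrative)
X = rng.normal(0, 1, size=(1000, 2))
y = (X @ w + b > 0).astype(float)    # labels consistent with the model

def fgsm(X, y, eps):
    # d(logistic loss)/dx = (sigmoid(z) - y) * w, so FGSM steps along its sign
    p = 1 / (1 + np.exp(-(X @ w + b)))
    grad = (p - y)[:, None] * w[None, :]
    return X + eps * np.sign(grad)

for eps in (0.0, 0.1, 0.3, 0.5):
    Xa = fgsm(X, y, eps)
    acc = float(np.mean(((Xa @ w + b) > 0).astype(float) == y))
    print(f"eps={eps:.1f}, accuracy under FGSM={acc:.3f}")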
Demo 19: Common corruptions
This cell studies common corruptions through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 41
header("Demo 19 - Common corruptions: tail risk via CVaR")
losses = np.random.lognormal(mean=-2.0, sigma=0.8, size=1200)
alpha = 0.9
threshold = np.quantile(losses, alpha)
cvar = losses[losses >= threshold].mean()
print(f"mean loss={losses.mean():.4f}, 90% tail threshold={threshold:.4f}, CVaR={cvar:.4f}")
check_true(cvar >= losses.mean(), "tail risk exceeds average risk")
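Common-corruption suites sweep a severity knob and report the resulting curve. A minimal sketch using additive Gaussian input noise against a fixed linear rule as a stand-in for real corruption families:
import numpy as np

rng = np.random.default_rng(0)
w = np.array([1.5, -1.0])
X = rng.normal(0, 1, size=(2000, 2))
y = (X @ w > 0).astype(int)

for severity, sigma in enumerate([0.0, 0.25, 0.5, 1.0, 2.0]):
    Xc = X + rng.normal(0, sigma, size=X.shape)   # corruption: additive Gaussian noise
    acc = float(np.mean(((Xc @ w) > 0).astype(int) == y))
    print(f"severity={severity}, sigma={sigma:.2f}, accuracy={acc:.3f}")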
Demo 20: Threat-model reporting
This cell studies threat-model reporting through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 43
header("Demo 20 - Threat-model reporting: slice metrics")
slices = np.array(["short", "long", "code", "math", "multilingual"])
acc = np.array([0.82, 0.71, 0.66, 0.58, 0.62])
n = np.array([400, 260, 180, 160, 140])
se = np.sqrt(acc * (1 - acc) / n)
for name, a, e in zip(slices, acc, se):
    print(f"slice={name:12s} accuracy={a:.3f} +/- {1.96*e:.3f}")
check_true(acc.min() < acc.max(), "slices reveal heterogeneous performance")
Demo 21: Worst-group accuracy
This cell studies worst-group accuracy through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 45
header("Demo 21 - Worst-group accuracy: finite-sample accuracy interval")
n = 600
y = (np.random.rand(n) < 0.74).astype(float)
mean = y.mean()
se = np.sqrt(mean * (1 - mean) / n)
lo, hi = mean - 1.96 * se, mean + 1.96 * se
print(f"accuracy={mean:.3f}, 95% CI=[{lo:.3f}, {hi:.3f}], n={n}")
check_true(lo <= mean <= hi, "point estimate lies inside its interval")
Demo 22: Conditional value at risk
This cell studies conditional value at risk through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 47
header("Demo 22 - Conditional value at risk: bootstrap uncertainty")
values = np.clip(np.random.normal(0.68, 0.18, size=500), 0, 1)
mean, lo, hi = bootstrap_mean_ci(values, B=800)
print(f"mean={mean:.3f}, bootstrap 95% CI=[{lo:.3f}, {hi:.3f}]")
check_true(hi > lo, "bootstrap interval has positive width")
Demo 23: Tail loss
This cell studies tail loss through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 49
header("Demo 23 - Tail loss: metric comparison plot")
names = np.array(["base", "prompt", "retrieval", "tool"])
scores = np.array([0.62, 0.68, 0.73, 0.70])
fig, ax = plt.subplots()
bars = ax.bar(names, scores, color=[COLORS["primary"], COLORS["secondary"], COLORS["tertiary"], COLORS["highlight"]])
ax.set_title("Evaluation metric across system variants")
ax.set_xlabel("System variant")
ax.set_ylabel("Score")
ax.set_ylim(0, 1)
for bar, val in zip(bars, scores):
    ax.text(bar.get_x() + bar.get_width()/2, val + 0.02, f"{val:.2f}", ha="center")
fig.tight_layout()
plt.show()
print("plotted metric comparison for four variants")
check_true(scores.max() > scores.min(), "variants differ in measured score")
Demo 24: Distributionally robust evaluation
This cell studies distributionally robust evaluation through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 51
header("Demo 24 - Distributionally robust evaluation: factorial interaction")
A = np.array([0, 0, 1, 1])
B = np.array([0, 1, 0, 1])
y = 0.60 + 0.05 * A + 0.03 * B + 0.08 * A * B
interaction = y[3] - y[2] - y[1] + y[0]
for a, b, val in zip(A, B, y):
    print(f"A={a}, B={b}, score={val:.3f}")
print(f"interaction={interaction:.3f}")
check_close(float(interaction), 0.08, tol=1e-8, message="interaction recovered")
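For distributionally robust evaluation itself, a minimal sketch of KL-tilted reweighting: the worst-case expected loss over a KL ball around the empirical distribution has an exponential-tilting form, with the temperature tau controlling how adversarial the reweighting is (each tau corresponds to some KL radius via duality). The tau values below are illustrative.
import numpy as np

rng = np.random.default_rng(0)
losses = rng.lognormal(mean=-2.0, sigma=0.8, size=1000)

def kl_tilted_risk(losses, tau):
    # exponential tilting: weights proportional to exp(loss / tau), then reweighted mean
    w = np.exp(losses / tau)
    w /= w.sum()
    return float((w * losses).sum())

print(f"average risk={losses.mean():.4f}")
for tau in (1.0, 0.5, 0.2):
    print(f"tau={tau}: KL-tilted robust risk={kl_tilted_risk(losses, tau):.4f}")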
Demo 25: Fairness and privacy side effects
This cell studies fairness and privacy side effects through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 53
header("Demo 25 - Fairness and privacy side effects: distribution shift")
train = np.random.normal(0.0, 1.0, size=(800, 2))
test = np.random.normal([0.7, -0.4], [1.2, 0.8], size=(800, 2))
train_mean = train.mean(axis=0)
test_mean = test.mean(axis=0)
shift_norm = np.linalg.norm(test_mean - train_mean)
fig, ax = plt.subplots()
ax.scatter(train[:200, 0], train[:200, 1], s=18, alpha=0.5, color=COLORS["primary"], label="train")
ax.scatter(test[:200, 0], test[:200, 1], s=18, alpha=0.5, color=COLORS["secondary"], label="test")
ax.set_title("Synthetic covariate shift")
ax.set_xlabel("feature 1")
ax.set_ylabel("feature 2")
ax.legend()
fig.tight_layout()
plt.show()
print(f"mean-shift norm={shift_norm:.3f}")
check_true(shift_norm > 0.2, "synthetic shift is detectable")