Theory Notebook — Math for LLMs

Robustness and Distribution Shift

Evaluation and Reliability / Robustness and Distribution Shift

Run notebook
Theory Notebook

Theory Notebook

Converted from theory.ipynb for web reading.

Robustness and Distribution Shift

Robustness evaluation measures how model risk changes when the test distribution, prompt surface, subgroup, or adversary changes.

This notebook is the executable companion to notes.md. It uses synthetic data so the evaluation mathematics can run anywhere without external files.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Prefer seaborn's theme when available; otherwise fall back to the
# equivalent matplotlib built-in style so plots look the same either way.
try:
    import seaborn as sns
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False
else:
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True

# Shared figure defaults for every plot in the notebook.
mpl.rcParams.update(
    {
        "figure.figsize": (10, 6),
        "figure.dpi": 120,
        "font.size": 13,
        "axes.titlesize": 15,
        "axes.labelsize": 13,
        "xtick.labelsize": 11,
        "ytick.labelsize": 11,
        "legend.fontsize": 11,
        "legend.framealpha": 0.85,
        "lines.linewidth": 2.0,
        "axes.spines.top": False,
        "axes.spines.right": False,
        "savefig.bbox": "tight",
        "savefig.dpi": 150,
    }
)
np.random.seed(42)  # deterministic synthetic data across runs
print("Plot setup complete.")

Code cell 3

import math

# Named hex colors shared by every plot below; chosen to stay
# distinguishable for common forms of color-vision deficiency.
COLORS = dict(
    primary="#0077BB",
    secondary="#EE7733",
    tertiary="#009988",
    error="#CC3311",
    neutral="#555555",
    highlight="#EE3377",
)

def header(title):
    """Print *title* framed above and below by an 80-character rule."""
    rule = "=" * 80
    print(f"\n{rule}")
    print(title)
    print(rule)

def check_true(condition, message):
    """Print a PASS/FAIL line for *message*, then assert the condition."""
    ok = bool(condition)
    status = "PASS" if ok else "FAIL"
    print(f"{status} - {message}")
    assert ok

def check_close(actual, expected, tol=1e-8, message="values close"):
    """Assert |actual - expected| <= tol, printing a PASS/FAIL summary line."""
    within = abs(actual - expected) <= tol
    status = "PASS" if within else "FAIL"
    print(f"{status} - {message}: actual={actual:.6f}, expected={expected:.6f}")
    assert within

def bootstrap_mean_ci(values, B=1000, alpha=0.05):
    """Return (mean, lo, hi) for *values*.

    The interval is a percentile bootstrap (1 - alpha) CI computed from
    B resampled means, drawn with the global numpy RNG.
    """
    data = np.asarray(values, dtype=float)
    n = len(data)
    # B resamples of size n, drawn with replacement.
    resample_idx = np.random.randint(0, n, size=(B, n))
    resampled_means = data[resample_idx].mean(axis=1)
    lo, hi = np.quantile(resampled_means, [alpha / 2, 1 - alpha / 2])
    return float(data.mean()), float(lo), float(hi)

def sigmoid(x):
    """Numerically stable logistic function 1 / (1 + exp(-x)).

    Accepts scalars or array-likes. The naive form overflows in
    np.exp(-x) for large negative x (RuntimeWarning, inf); computing
    exp(-|x|) keeps the exponent non-positive so exp never overflows,
    while returning the same values.
    """
    arr = np.asarray(x, dtype=float)
    z = np.exp(-np.abs(arr))  # in (0, 1], safe for any finite input
    out = np.where(arr >= 0, 1.0 / (1.0 + z), z / (1.0 + z))
    # Preserve scalar-in / scalar-out behavior of the original.
    return float(out) if out.ndim == 0 else out

print("Evaluation helper functions loaded.")

Demo 1: Deployment changes the data distribution

This cell studies Deployment changes the data distribution through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 5

header("Demo 1 - Deployment changes the data distribution: distribution shift")
# Train vs. test samples drawn from deliberately different Gaussians.
train = np.random.normal(0.0, 1.0, size=(800, 2))
test = np.random.normal([0.7, -0.4], [1.2, 0.8], size=(800, 2))
# Magnitude of the covariate shift: distance between the empirical means.
shift_norm = np.linalg.norm(test.mean(axis=0) - train.mean(axis=0))
fig, ax = plt.subplots()
for points, color_key, label in ((train, "primary", "train"), (test, "secondary", "test")):
    ax.scatter(points[:200, 0], points[:200, 1], s=18, alpha=0.5,
               color=COLORS[color_key], label=label)
ax.set(title="Synthetic covariate shift", xlabel="feature 1", ylabel="feature 2")
ax.legend()
fig.tight_layout()
plt.show()
print(f"mean-shift norm={shift_norm:.3f}")
check_true(shift_norm > 0.2, "synthetic shift is detectable")

Demo 2: Prompt surface as an input distribution

This cell studies Prompt surface as an input distribution through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 7

header("Demo 2 - Prompt surface as an input distribution: worst-group risk")
# Group -> (sample count, empirical risk); minority groups carry higher risk.
cohort = {"A": (500, 0.08), "B": (300, 0.11), "C": (120, 0.19), "D": (80, 0.27)}
groups = np.array(list(cohort))
counts = np.array([n for n, _ in cohort.values()])
risks = np.array([r for _, r in cohort.values()])
# Population-weighted average vs. the single worst subgroup.
overall = np.average(risks, weights=counts)
worst = risks.max()
for g, (n, r) in cohort.items():
    print(f"group={g}, n={n}, risk={r:.3f}")
print(f"overall risk={overall:.3f}, worst-group risk={worst:.3f}")
check_true(worst >= overall, "worst-group risk upper-bounds weighted average risk")

Demo 3: Rare tails dominate reliability risk

This cell studies Rare tails dominate reliability risk through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 9

header("Demo 3 - Rare tails dominate reliability risk: tail risk via CVaR")
# Heavy-ish right tail: lognormal losses.
losses = np.random.lognormal(mean=-2.0, sigma=0.8, size=1200)
alpha = 0.9
# Value-at-Risk: the alpha-quantile of the loss distribution.
threshold = np.quantile(losses, alpha)
# CVaR: mean loss conditional on landing in the upper tail.
tail = losses[losses >= threshold]
cvar = tail.mean()
avg = losses.mean()
print(f"mean loss={avg:.4f}, 90% tail threshold={threshold:.4f}, CVaR={cvar:.4f}")
check_true(cvar >= avg, "tail risk exceeds average risk")

Demo 4: Robustness is not only adversarial accuracy

This cell studies Robustness is not only adversarial accuracy through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 11

header("Demo 4 - Robustness is not only adversarial accuracy: slice metrics")
slices = np.array(["short", "long", "code", "math", "multilingual"])
acc = np.array([0.82, 0.71, 0.66, 0.58, 0.62])
n = np.array([400, 260, 180, 160, 140])
# Binomial standard error per slice, reported as a 95% half-width.
se = np.sqrt(acc * (1 - acc) / n)
half_width = 1.96 * se
for name, a, h in zip(slices, acc, half_width):
    print(f"slice={name:12s} accuracy={a:.3f} +/- {h:.3f}")
check_true(acc.min() < acc.max(), "slices reveal heterogeneous performance")

Demo 5: Reliability budgets across shifts

This cell studies Reliability budgets across shifts through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 13

header("Demo 5 - Reliability budgets across shifts: finite-sample accuracy interval")
n = 600
# Simulated per-example correctness: Bernoulli(0.74) indicators.
y = (np.random.rand(n) < 0.74).astype(float)
mean = y.mean()
# Normal-approximation (Wald) standard error for a proportion.
se = np.sqrt(mean * (1 - mean) / n)
half_width = 1.96 * se
lo, hi = mean - half_width, mean + half_width
print(f"accuracy={mean:.3f}, 95% CI=[{lo:.3f}, {hi:.3f}], n={n}")
check_true(lo <= mean <= hi, "point estimate lies inside its interval")

Demo 6: Training and test distributions

This cell studies Training and test distributions through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 15

header("Demo 6 - Training and test distributions: bootstrap uncertainty")
# Synthetic per-example scores clipped into the valid [0, 1] metric range.
scores = np.clip(np.random.normal(0.68, 0.18, size=500), 0, 1)
point, lower, upper = bootstrap_mean_ci(scores, B=800)
print(f"mean={point:.3f}, bootstrap 95% CI=[{lower:.3f}, {upper:.3f}]")
check_true(upper > lower, "bootstrap interval has positive width")

Demo 7: Covariate shift, label shift, and concept shift

This cell studies Covariate shift, label shift, and concept shift through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 17

header("Demo 7 - Covariate shift, label shift, and concept shift: metric comparison plot")
names = np.array(["base", "prompt", "retrieval", "tool"])
scores = np.array([0.62, 0.68, 0.73, 0.70])
palette = [COLORS[key] for key in ("primary", "secondary", "tertiary", "highlight")]
fig, ax = plt.subplots()
bars = ax.bar(names, scores, color=palette)
ax.set(title="Evaluation metric across system variants",
       xlabel="System variant", ylabel="Score", ylim=(0, 1))
# Label each bar with its score just above the bar top.
for rect, score in zip(bars, scores):
    x_center = rect.get_x() + rect.get_width() / 2
    ax.text(x_center, score + 0.02, f"{score:.2f}", ha="center")
fig.tight_layout()
plt.show()
print("plotted metric comparison for four variants")
check_true(scores.max() > scores.min(), "variants differ in measured score")

Demo 8: Subgroup risk

This cell studies Subgroup risk through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 19

header("Demo 8 - Subgroup risk: factorial interaction")
# 2x2 factorial design over binary factors A and B.
A = np.array([0, 0, 1, 1])
B = np.array([0, 1, 0, 1])
y = 0.60 + 0.05 * A + 0.03 * B + 0.08 * A * B
# Difference-in-differences recovers the A:B interaction coefficient.
interaction = (y[3] + y[0]) - (y[2] + y[1])
for level_a, level_b, score in zip(A, B, y):
    print(f"A={level_a}, B={level_b}, score={score:.3f}")
print(f"interaction={interaction:.3f}")
check_close(float(interaction), 0.08, tol=1e-8, message="interaction recovered")

Demo 9: Robust risk and worst-case risk

This cell studies Robust risk and worst-case risk through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 21

header("Demo 9 - Robust risk and worst-case risk: distribution shift")
# Train vs. test samples drawn from deliberately different Gaussians.
train = np.random.normal(0.0, 1.0, size=(800, 2))
test = np.random.normal([0.7, -0.4], [1.2, 0.8], size=(800, 2))
# Magnitude of the covariate shift: distance between the empirical means.
shift_norm = np.linalg.norm(test.mean(axis=0) - train.mean(axis=0))
fig, ax = plt.subplots()
for points, color_key, label in ((train, "primary", "train"), (test, "secondary", "test")):
    ax.scatter(points[:200, 0], points[:200, 1], s=18, alpha=0.5,
               color=COLORS[color_key], label=label)
ax.set(title="Synthetic covariate shift", xlabel="feature 1", ylabel="feature 2")
ax.legend()
fig.tight_layout()
plt.show()
print(f"mean-shift norm={shift_norm:.3f}")
check_true(shift_norm > 0.2, "synthetic shift is detectable")

Demo 10: Threat model and perturbation set

This cell studies Threat model and perturbation set through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 23

header("Demo 10 - Threat model and perturbation set: worst-group risk")
# Group -> (sample count, empirical risk); minority groups carry higher risk.
cohort = {"A": (500, 0.08), "B": (300, 0.11), "C": (120, 0.19), "D": (80, 0.27)}
groups = np.array(list(cohort))
counts = np.array([n for n, _ in cohort.values()])
risks = np.array([r for _, r in cohort.values()])
# Population-weighted average vs. the single worst subgroup.
overall = np.average(risks, weights=counts)
worst = risks.max()
for g, (n, r) in cohort.items():
    print(f"group={g}, n={n}, risk={r:.3f}")
print(f"overall risk={overall:.3f}, worst-group risk={worst:.3f}")
check_true(worst >= overall, "worst-group risk upper-bounds weighted average risk")

Demo 11: Two-sample tests

This cell studies Two-sample tests through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 25

header("Demo 11 - Two-sample tests: tail risk via CVaR")
# Heavy-ish right tail: lognormal losses.
losses = np.random.lognormal(mean=-2.0, sigma=0.8, size=1200)
alpha = 0.9
# Value-at-Risk: the alpha-quantile of the loss distribution.
threshold = np.quantile(losses, alpha)
# CVaR: mean loss conditional on landing in the upper tail.
tail = losses[losses >= threshold]
cvar = tail.mean()
avg = losses.mean()
print(f"mean loss={avg:.4f}, 90% tail threshold={threshold:.4f}, CVaR={cvar:.4f}")
check_true(cvar >= avg, "tail risk exceeds average risk")

Demo 12: MMD and Wasserstein previews

This cell studies MMD and Wasserstein previews through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 27

header("Demo 12 - MMD and Wasserstein previews: slice metrics")
slices = np.array(["short", "long", "code", "math", "multilingual"])
acc = np.array([0.82, 0.71, 0.66, 0.58, 0.62])
n = np.array([400, 260, 180, 160, 140])
# Binomial standard error per slice, reported as a 95% half-width.
se = np.sqrt(acc * (1 - acc) / n)
half_width = 1.96 * se
for name, a, h in zip(slices, acc, half_width):
    print(f"slice={name:12s} accuracy={a:.3f} +/- {h:.3f}")
check_true(acc.min() < acc.max(), "slices reveal heterogeneous performance")

Demo 13: Embedding drift

This cell studies Embedding drift through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 29

header("Demo 13 - Embedding drift: finite-sample accuracy interval")
n = 600
# Simulated per-example correctness: Bernoulli(0.74) indicators.
y = (np.random.rand(n) < 0.74).astype(float)
mean = y.mean()
# Normal-approximation (Wald) standard error for a proportion.
se = np.sqrt(mean * (1 - mean) / n)
half_width = 1.96 * se
lo, hi = mean - half_width, mean + half_width
print(f"accuracy={mean:.3f}, 95% CI=[{lo:.3f}, {hi:.3f}], n={n}")
check_true(lo <= mean <= hi, "point estimate lies inside its interval")

Demo 14: Slice drift

This cell studies Slice drift through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 31

header("Demo 14 - Slice drift: bootstrap uncertainty")
# Synthetic per-example scores clipped into the valid [0, 1] metric range.
scores = np.clip(np.random.normal(0.68, 0.18, size=500), 0, 1)
point, lower, upper = bootstrap_mean_ci(scores, B=800)
print(f"mean={point:.3f}, bootstrap 95% CI=[{lower:.3f}, {upper:.3f}]")
check_true(upper > lower, "bootstrap interval has positive width")

Demo 15: OOD score functions

This cell studies OOD score functions through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 33

header("Demo 15 - OOD score functions: metric comparison plot")
names = np.array(["base", "prompt", "retrieval", "tool"])
scores = np.array([0.62, 0.68, 0.73, 0.70])
palette = [COLORS[key] for key in ("primary", "secondary", "tertiary", "highlight")]
fig, ax = plt.subplots()
bars = ax.bar(names, scores, color=palette)
ax.set(title="Evaluation metric across system variants",
       xlabel="System variant", ylabel="Score", ylim=(0, 1))
# Label each bar with its score just above the bar top.
for rect, score in zip(bars, scores):
    x_center = rect.get_x() + rect.get_width() / 2
    ax.text(x_center, score + 0.02, f"{score:.2f}", ha="center")
fig.tight_layout()
plt.show()
print("plotted metric comparison for four variants")
check_true(scores.max() > scores.min(), "variants differ in measured score")

Demo 16: Perturbation tests

This cell studies Perturbation tests through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 35

header("Demo 16 - Perturbation tests: factorial interaction")
# 2x2 factorial design over binary factors A and B.
A = np.array([0, 0, 1, 1])
B = np.array([0, 1, 0, 1])
y = 0.60 + 0.05 * A + 0.03 * B + 0.08 * A * B
# Difference-in-differences recovers the A:B interaction coefficient.
interaction = (y[3] + y[0]) - (y[2] + y[1])
for level_a, level_b, score in zip(A, B, y):
    print(f"A={level_a}, B={level_b}, score={score:.3f}")
print(f"interaction={interaction:.3f}")
check_close(float(interaction), 0.08, tol=1e-8, message="interaction recovered")

Demo 17: Stress tests

This cell studies Stress tests through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 37

header("Demo 17 - Stress tests: distribution shift")
# Train vs. test samples drawn from deliberately different Gaussians.
train = np.random.normal(0.0, 1.0, size=(800, 2))
test = np.random.normal([0.7, -0.4], [1.2, 0.8], size=(800, 2))
# Magnitude of the covariate shift: distance between the empirical means.
shift_norm = np.linalg.norm(test.mean(axis=0) - train.mean(axis=0))
fig, ax = plt.subplots()
for points, color_key, label in ((train, "primary", "train"), (test, "secondary", "test")):
    ax.scatter(points[:200, 0], points[:200, 1], s=18, alpha=0.5,
               color=COLORS[color_key], label=label)
ax.set(title="Synthetic covariate shift", xlabel="feature 1", ylabel="feature 2")
ax.legend()
fig.tight_layout()
plt.show()
print(f"mean-shift norm={shift_norm:.3f}")
check_true(shift_norm > 0.2, "synthetic shift is detectable")

Demo 18: Adversarial examples

This cell studies Adversarial examples through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 39

header("Demo 18 - Adversarial examples: worst-group risk")
# Group -> (sample count, empirical risk); minority groups carry higher risk.
cohort = {"A": (500, 0.08), "B": (300, 0.11), "C": (120, 0.19), "D": (80, 0.27)}
groups = np.array(list(cohort))
counts = np.array([n for n, _ in cohort.values()])
risks = np.array([r for _, r in cohort.values()])
# Population-weighted average vs. the single worst subgroup.
overall = np.average(risks, weights=counts)
worst = risks.max()
for g, (n, r) in cohort.items():
    print(f"group={g}, n={n}, risk={r:.3f}")
print(f"overall risk={overall:.3f}, worst-group risk={worst:.3f}")
check_true(worst >= overall, "worst-group risk upper-bounds weighted average risk")

Demo 19: Common corruptions

This cell studies Common corruptions through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 41

header("Demo 19 - Common corruptions: tail risk via CVaR")
# Heavy-ish right tail: lognormal losses.
losses = np.random.lognormal(mean=-2.0, sigma=0.8, size=1200)
alpha = 0.9
# Value-at-Risk: the alpha-quantile of the loss distribution.
threshold = np.quantile(losses, alpha)
# CVaR: mean loss conditional on landing in the upper tail.
tail = losses[losses >= threshold]
cvar = tail.mean()
avg = losses.mean()
print(f"mean loss={avg:.4f}, 90% tail threshold={threshold:.4f}, CVaR={cvar:.4f}")
check_true(cvar >= avg, "tail risk exceeds average risk")

Demo 20: Threat-model reporting

This cell studies Threat-model reporting through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 43

header("Demo 20 - Threat-model reporting: slice metrics")
slices = np.array(["short", "long", "code", "math", "multilingual"])
acc = np.array([0.82, 0.71, 0.66, 0.58, 0.62])
n = np.array([400, 260, 180, 160, 140])
# Binomial standard error per slice, reported as a 95% half-width.
se = np.sqrt(acc * (1 - acc) / n)
half_width = 1.96 * se
for name, a, h in zip(slices, acc, half_width):
    print(f"slice={name:12s} accuracy={a:.3f} +/- {h:.3f}")
check_true(acc.min() < acc.max(), "slices reveal heterogeneous performance")

Demo 21: Worst-group accuracy

This cell studies Worst-group accuracy through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 45

header("Demo 21 - Worst-group accuracy: finite-sample accuracy interval")
n = 600
# Simulated per-example correctness: Bernoulli(0.74) indicators.
y = (np.random.rand(n) < 0.74).astype(float)
mean = y.mean()
# Normal-approximation (Wald) standard error for a proportion.
se = np.sqrt(mean * (1 - mean) / n)
half_width = 1.96 * se
lo, hi = mean - half_width, mean + half_width
print(f"accuracy={mean:.3f}, 95% CI=[{lo:.3f}, {hi:.3f}], n={n}")
check_true(lo <= mean <= hi, "point estimate lies inside its interval")

Demo 22: Conditional value at risk

This cell studies Conditional value at risk through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 47

header("Demo 22 - Conditional value at risk: bootstrap uncertainty")
# Synthetic per-example scores clipped into the valid [0, 1] metric range.
scores = np.clip(np.random.normal(0.68, 0.18, size=500), 0, 1)
point, lower, upper = bootstrap_mean_ci(scores, B=800)
print(f"mean={point:.3f}, bootstrap 95% CI=[{lower:.3f}, {upper:.3f}]")
check_true(upper > lower, "bootstrap interval has positive width")

Demo 23: Tail loss

This cell studies Tail loss through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 49

header("Demo 23 - Tail loss: metric comparison plot")
names = np.array(["base", "prompt", "retrieval", "tool"])
scores = np.array([0.62, 0.68, 0.73, 0.70])
palette = [COLORS[key] for key in ("primary", "secondary", "tertiary", "highlight")]
fig, ax = plt.subplots()
bars = ax.bar(names, scores, color=palette)
ax.set(title="Evaluation metric across system variants",
       xlabel="System variant", ylabel="Score", ylim=(0, 1))
# Label each bar with its score just above the bar top.
for rect, score in zip(bars, scores):
    x_center = rect.get_x() + rect.get_width() / 2
    ax.text(x_center, score + 0.02, f"{score:.2f}", ha="center")
fig.tight_layout()
plt.show()
print("plotted metric comparison for four variants")
check_true(scores.max() > scores.min(), "variants differ in measured score")

Demo 24: Distributionally robust evaluation

This cell studies Distributionally robust evaluation through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 51

header("Demo 24 - Distributionally robust evaluation: factorial interaction")
# 2x2 factorial design over binary factors A and B.
A = np.array([0, 0, 1, 1])
B = np.array([0, 1, 0, 1])
y = 0.60 + 0.05 * A + 0.03 * B + 0.08 * A * B
# Difference-in-differences recovers the A:B interaction coefficient.
interaction = (y[3] + y[0]) - (y[2] + y[1])
for level_a, level_b, score in zip(A, B, y):
    print(f"A={level_a}, B={level_b}, score={score:.3f}")
print(f"interaction={interaction:.3f}")
check_close(float(interaction), 0.08, tol=1e-8, message="interaction recovered")

Demo 25: Fairness and privacy side effects

This cell studies Fairness and privacy side effects through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.

Code cell 53

header("Demo 25 - Fairness and privacy side effects: distribution shift")
# Train vs. test samples drawn from deliberately different Gaussians.
train = np.random.normal(0.0, 1.0, size=(800, 2))
test = np.random.normal([0.7, -0.4], [1.2, 0.8], size=(800, 2))
# Magnitude of the covariate shift: distance between the empirical means.
shift_norm = np.linalg.norm(test.mean(axis=0) - train.mean(axis=0))
fig, ax = plt.subplots()
for points, color_key, label in ((train, "primary", "train"), (test, "secondary", "test")):
    ax.scatter(points[:200, 0], points[:200, 1], s=18, alpha=0.5,
               color=COLORS[color_key], label=label)
ax.set(title="Synthetic covariate shift", xlabel="feature 1", ylabel="feature 2")
ax.legend()
fig.tight_layout()
plt.show()
print(f"mean-shift norm={shift_norm:.3f}")
check_true(shift_norm > 0.2, "synthetic shift is detectable")