Exercises Notebook
Converted from exercises.ipynb for web reading.
Exercises: Robustness and Distribution Shift
There are 10 exercises. Exercises 1-3 cover mechanics, 4-6 cover theory, and 7-10 connect the math to AI evaluation workflows.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Prefer seaborn's colorblind theme; fall back to the bundled
# matplotlib stylesheet when seaborn is not installed.
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

# Notebook-wide figure defaults: readable fonts, open spines, tight saves.
_RC_DEFAULTS = {
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
}
mpl.rcParams.update(_RC_DEFAULTS)

# Fixed seed so the bootstrap/simulation cells below are reproducible.
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import math

# Shared colorblind-safe hex palette used by the plotting cells.
COLORS = dict(
    primary="#0077BB",
    secondary="#EE7733",
    tertiary="#009988",
    error="#CC3311",
    neutral="#555555",
    highlight="#EE3377",
)
def header(title):
    """Print *title* framed above and below by 80-character rules."""
    rule = "=" * 80
    print(f"\n{rule}\n{title}\n{rule}")
def check_true(condition, message):
    """Print a PASS/FAIL line for *condition*, then assert it holds."""
    ok = bool(condition)
    verdict = "PASS" if ok else "FAIL"
    print(f"{verdict} - {message}")
    assert ok
def check_close(actual, expected, tol=1e-8, message="values close"):
    """Print PASS/FAIL for |actual - expected| <= tol, then assert it."""
    within = abs(actual - expected) <= tol
    verdict = "PASS" if within else "FAIL"
    print(f"{verdict} - {message}: actual={actual:.6f}, expected={expected:.6f}")
    assert within
def bootstrap_mean_ci(values, B=1000, alpha=0.05):
    """Return (mean, lo, hi): the sample mean plus a percentile bootstrap CI.

    Draws B resamples with replacement (via the global NumPy RNG) and takes
    the alpha/2 and 1 - alpha/2 quantiles of the resampled means.
    """
    data = np.asarray(values, dtype=float)
    n = len(data)
    # One (B, n) index draw keeps the RNG call sequence identical per seed.
    resample_idx = np.random.randint(0, n, size=(B, n))
    resampled_means = data[resample_idx].mean(axis=1)
    lo, hi = np.quantile(resampled_means, [alpha / 2, 1 - alpha / 2])
    return float(data.mean()), float(lo), float(hi)
def sigmoid(x):
    """Numerically stable logistic function 1 / (1 + exp(-x)).

    The naive form overflows np.exp (RuntimeWarning, then 0.0) for large
    negative inputs. Clipping the argument to [-500, 500] prevents the
    overflow without changing any result: the sigmoid is fully saturated
    (0.0 or 1.0 in float64) far before |x| reaches 500.
    """
    z = np.clip(x, -500.0, 500.0)
    return 1 / (1 + np.exp(-z))

print("Evaluation helper functions loaded.")
Exercise 1: Deployment changes the data distribution (*)
Define the estimator, compute it on a tiny synthetic example, and interpret what would make the result reliable or misleading.
Code cell 5
# Your Solution - Exercise 1
answer = None  # TODO: replace with your own computed value
print(f"answer = {answer}")
Code cell 6
# Solution
header("Exercise 1: Estimate a benchmark mean")
# Binary per-item outcomes; the benchmark score is just their mean.
outcomes = np.array([1, 0, 1, 1, 0, 1, 1, 1], dtype=float)
estimate = outcomes.sum() / outcomes.size
check_close(estimate, 0.75, message="accuracy estimate")
print("\nTakeaway: A benchmark score is an empirical mean with a denominator.")
Exercise 2: Prompt surface as an input distribution (*)
Define the estimator, compute it on a tiny synthetic example, and interpret what would make the result reliable or misleading.
Code cell 8
# Your Solution - Exercise 2
answer = None  # TODO: replace with your own computed value
print(f"answer = {answer}")
Code cell 9
# Solution
header("Exercise 2: Bootstrap an interval")
sample = np.linspace(0.2, 0.9, 20)
mean, lo, hi = bootstrap_mean_ci(sample, B=300)
# The sample mean should sit strictly inside its own bootstrap interval.
inside = (lo < mean) and (mean < hi)
check_true(inside, "bootstrap interval contains the sample mean")
print("\nTakeaway: Uncertainty belongs next to the point estimate.")
Exercise 3: Rare tails dominate reliability risk (*)
Define the estimator, compute it on a tiny synthetic example, and interpret what would make the result reliable or misleading.
Code cell 11
# Your Solution - Exercise 3
answer = None  # TODO: replace with your own computed value
print(f"answer = {answer}")
Code cell 12
# Solution
header("Exercise 3: Compare paired models")
# Same items scored by two models, so the per-item differences pair up.
model_a = np.array([1, 1, 0, 1, 0, 1])
model_b = np.array([1, 0, 0, 1, 1, 1])
diff = float(np.mean(model_b - model_a))
check_close(diff, 0.0, message="paired mean difference")
print("\nTakeaway: Paired comparison can reveal that apparent wins are noise.")
Exercise 4: Robustness is not only adversarial accuracy (**)
Define the estimator, compute it on a tiny synthetic example, and interpret what would make the result reliable or misleading.
Code cell 14
# Your Solution - Exercise 4
answer = None  # TODO: replace with your own computed value
print(f"answer = {answer}")
Code cell 15
# Solution
header("Exercise 4: Compute calibration error")
conf = np.array([0.2, 0.4, 0.8, 0.9])
corr = np.array([0, 0, 1, 1], dtype=float)
# Two equal-weight bins (first two items, last two items); ECE is the
# weighted gap between mean confidence and mean correctness per bin.
ece = 0.0
for bucket in (slice(0, 2), slice(2, 4)):
    ece += 0.5 * abs(conf[bucket].mean() - corr[bucket].mean())
check_true(ece >= 0, "ECE is nonnegative")
print("\nTakeaway: Calibration compares confidence to empirical correctness.")
Exercise 5: Reliability budgets across shifts (**)
Define the estimator, compute it on a tiny synthetic example, and interpret what would make the result reliable or misleading.
Code cell 17
# Your Solution - Exercise 5
answer = None  # TODO: replace with your own computed value
print(f"answer = {answer}")
Code cell 18
# Solution
header("Exercise 5: Measure worst-group risk")
# Per-slice risks; the worst slice bounds the average from above.
risks = np.array([0.05, 0.08, 0.22])
overall = float(np.mean(risks))
worst = float(np.max(risks))
check_true(worst >= overall, "worst group is at least average risk")
print("\nTakeaway: Reliability can be controlled by the hardest slice.")
Exercise 6: Training and test distributions (**)
Define the estimator, compute it on a tiny synthetic example, and interpret what would make the result reliable or misleading.
Code cell 20
# Your Solution - Exercise 6
answer = None  # TODO: replace with your own computed value
print(f"answer = {answer}")
Code cell 21
# Solution
header("Exercise 6: Estimate an ablation effect")
# Score with the component vs. score with it removed, same protocol.
full, without_retrieval = 0.78, 0.69
effect = full - without_retrieval
check_close(effect, 0.09, message="retrieval ablation effect")
print("\nTakeaway: Ablations estimate contribution under a stated protocol.")
Exercise 7: Covariate shift, label shift, and concept shift (***)
Define the estimator, compute it on a tiny synthetic example, and interpret what would make the result reliable or misleading.
Code cell 23
# Your Solution - Exercise 7
answer = None  # TODO: replace with your own computed value
print(f"answer = {answer}")
Code cell 24
# Solution
header("Exercise 7: Check a factorial interaction")
# 2x2 factorial means: y<A><B> is the score with factors A, B on/off.
y00, y10, y01, y11 = 0.60, 0.66, 0.63, 0.75
# Interaction = difference of the simple effects of factor A at each B.
interaction = (y11 - y01) - (y10 - y00)
check_close(interaction, 0.06, message="interaction effect")
print("\nTakeaway: Components can interact; effects need not add linearly.")
Exercise 8: Subgroup risk (***)
Define the estimator, compute it on a tiny synthetic example, and interpret what would make the result reliable or misleading.
Code cell 26
# Your Solution - Exercise 8
answer = None  # TODO: replace with your own computed value
print(f"answer = {answer}")
Code cell 27
# Solution
header("Exercise 8: AB-test standard error")
n = 1000
p1, p2 = 0.10, 0.12
# SE of a difference in proportions: sum the per-arm binomial variances.
variance = p1 * (1 - p1) / n + p2 * (1 - p2) / n
se = float(np.sqrt(variance))
check_true(se > 0, "standard error is positive")
print("\nTakeaway: Online effects need enough sample size to be visible.")
Exercise 9: Robust risk and worst-case risk (***)
Define the estimator, compute it on a tiny synthetic example, and interpret what would make the result reliable or misleading.
Code cell 29
# Your Solution - Exercise 9
answer = None  # TODO: replace with your own computed value
print(f"answer = {answer}")
Code cell 30
# Solution
header("Exercise 9: Detect sample-ratio mismatch")
# Observed vs. expected assignment counts for a 50/50 split.
obs = np.array([520, 480])
exp = np.array([500, 500])
chi2 = float(np.sum((obs - exp) ** 2 / exp))
check_true(chi2 >= 0, "chi-squared statistic is nonnegative")
print("\nTakeaway: Trust checks happen before interpreting treatment effects.")
Exercise 10: Threat model and perturbation set (***)
Define the estimator, compute it on a tiny synthetic example, and interpret what would make the result reliable or misleading.
Code cell 32
# Your Solution - Exercise 10
answer = None  # TODO: replace with your own computed value
print(f"answer = {answer}")
Code cell 33
# Solution
header("Exercise 10: Build a decision rule")
estimate, half_width = 0.012, 0.006
# Ship only when the lower end of the interval clears zero.
lower_bound = estimate - half_width
ship = lower_bound > 0
check_true(ship, "lower confidence bound is above zero")
print("\nTakeaway: A release decision should include uncertainty, not only direction.")