Theory Notebook
Converted from theory.ipynb for web reading.
Online Experimentation and AB Testing
Online experiments connect offline model evidence to causal user and system impact through randomized comparison, statistical inference, and trust checks.
This notebook is the executable companion to notes.md. It uses synthetic data so the evaluation mathematics can run anywhere without external files.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Use seaborn styling when it is installed; otherwise fall back to the
# equivalent built-in matplotlib style sheet.
try:
    import seaborn as sns
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False
else:
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True

# Shared figure defaults applied to every plot in the notebook.
mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})

# Fixed seed so all synthetic draws below are reproducible.
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import math

# Hex palette shared by all demo plots; keys are semantic roles so each demo
# picks colors by meaning rather than by raw value. The values appear to be
# a colorblind-friendly scheme (Paul Tol "bright") -- TODO confirm.
COLORS = {
"primary": "#0077BB",
"secondary": "#EE7733",
"tertiary": "#009988",
"error": "#CC3311",
"neutral": "#555555",
"highlight": "#EE3377",
}
def header(title):
    """Print *title* between two 80-character rule lines."""
    rule = "=" * 80
    print("\n" + rule)
    print(title)
    print(rule)
def check_true(condition, message):
    """Print a PASS/FAIL line for *condition*, then assert it holds."""
    verdict = bool(condition)
    label = "PASS" if verdict else "FAIL"
    print(f"{label} - {message}")
    assert verdict
def check_close(actual, expected, tol=1e-8, message="values close"):
    """Print PASS/FAIL for |actual - expected| <= tol, then assert it."""
    within = abs(actual - expected) <= tol
    status = "PASS" if within else "FAIL"
    print(f"{status} - {message}: actual={actual:.6f}, expected={expected:.6f}")
    assert within
def bootstrap_mean_ci(values, B=1000, alpha=0.05):
    """Percentile-bootstrap confidence interval for the mean of *values*.

    Args:
        values: 1-D array-like of numeric observations.
        B: number of bootstrap resamples.
        alpha: two-sided miscoverage level (0.05 -> 95% interval).

    Returns:
        (mean, lo, hi) as floats: the sample mean and the alpha/2 and
        1-alpha/2 quantiles of the bootstrap distribution of the mean.

    Raises:
        ValueError: if *values* is empty (the original code failed with an
            opaque numpy error from randint(0, 0) in that case).

    Uses the global numpy RNG, so results are reproducible under np.random.seed.
    """
    values = np.asarray(values, dtype=float)
    if values.size == 0:
        raise ValueError("bootstrap_mean_ci requires at least one observation")
    # Resample all B replicates at once: one (B, n) index matrix.
    idx = np.random.randint(0, len(values), size=(B, len(values)))
    boot = values[idx].mean(axis=1)
    lo, hi = np.quantile(boot, [alpha / 2, 1 - alpha / 2])
    return float(values.mean()), float(lo), float(hi)
def sigmoid(x):
    """Numerically stable logistic function 1 / (1 + exp(-x)).

    The naive form np.exp(-x) overflows (RuntimeWarning) for large negative
    x; exp(-logaddexp(0, -x)) computes the same value without overflow and
    saturates cleanly to 0.0 / 1.0 at the extremes.
    """
    return np.exp(-np.logaddexp(0.0, -np.asarray(x, dtype=float)))

print("Evaluation helper functions loaded.")
Demo 1: Offline eval predicts, online tests measure
This cell studies Offline eval predicts, online tests measure through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 5
header("Demo 1 - Offline eval predicts, online tests measure: two-proportion AB test")
n_c, n_t = 5000, 5000
p_c, p_t = 0.112, 0.124
x_c = np.random.binomial(n_c, p_c)
x_t = np.random.binomial(n_t, p_t)
phat_c, phat_t = x_c / n_c, x_t / n_t
diff = phat_t - phat_c
se = np.sqrt(phat_c * (1 - phat_c) / n_c + phat_t * (1 - phat_t) / n_t)
z = diff / se
print(f"control={phat_c:.4f}, treatment={phat_t:.4f}, diff={diff:.4f}, z={z:.2f}")
check_true(se > 0, "standard error is positive")
Demo 2: Randomization as causal design
This cell studies Randomization as causal design through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 7
header("Demo 2 - Randomization as causal design: sample size planning")
baseline = 0.10
mde = 0.01
z_alpha = 1.96
z_beta = 0.84
n_per_arm = 2 * baseline * (1 - baseline) * (z_alpha + z_beta) ** 2 / (mde ** 2)
print(f"n per arm for 1pp MDE around 10% baseline: {math.ceil(n_per_arm)}")
check_true(n_per_arm > 1000, "small effects need large online samples")
Demo 3: Overall evaluation criterion
This cell studies Overall evaluation criterion through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 9
header("Demo 3 - Overall evaluation criterion: sample-ratio mismatch")
observed = np.array([5120, 4880])
expected = observed.sum() * np.array([0.5, 0.5])
chi2 = ((observed - expected) ** 2 / expected).sum()
print(f"observed={observed.tolist()}, chi2={chi2:.3f}")
check_true(chi2 >= 0, "chi-squared statistic is nonnegative")
Demo 4: Guardrails and downside risk
This cell studies Guardrails and downside risk through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 11
header("Demo 4 - Guardrails and downside risk: bootstrap uncertainty")
values = np.clip(np.random.normal(0.68, 0.18, size=500), 0, 1)
mean, lo, hi = bootstrap_mean_ci(values, B=800)
print(f"mean={mean:.3f}, bootstrap 95% CI=[{lo:.3f}, {hi:.3f}]")
check_true(hi > lo, "bootstrap interval has positive width")
Demo 5: Experiment culture for LLM systems
This cell studies Experiment culture for LLM systems through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 13
header("Demo 5 - Experiment culture for LLM systems: metric comparison plot")
names = np.array(["base", "prompt", "retrieval", "tool"])
scores = np.array([0.62, 0.68, 0.73, 0.70])
fig, ax = plt.subplots()
bars = ax.bar(names, scores, color=[COLORS["primary"], COLORS["secondary"], COLORS["tertiary"], COLORS["highlight"]])
ax.set_title("Evaluation metric across system variants")
ax.set_xlabel("System variant")
ax.set_ylabel("Score")
ax.set_ylim(0, 1)
for bar, val in zip(bars, scores):
ax.text(bar.get_x() + bar.get_width()/2, val + 0.02, f"{val:.2f}", ha="center")
fig.tight_layout()
plt.show()
print("plotted metric comparison for four variants")
check_true(scores.max() > scores.min(), "variants differ in measured score")
Demo 6: Treatment and control
This cell studies Treatment and control through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 15
header("Demo 6 - Treatment and control: factorial interaction")
A = np.array([0, 0, 1, 1])
B = np.array([0, 1, 0, 1])
y = 0.60 + 0.05 * A + 0.03 * B + 0.08 * A * B
interaction = y[3] - y[2] - y[1] + y[0]
for a, b, val in zip(A, B, y):
print(f"A={a}, B={b}, score={val:.3f}")
print(f"interaction={interaction:.3f}")
check_close(float(interaction), 0.08, tol=1e-8, message="interaction recovered")
Demo 7: Randomization unit
This cell studies Randomization unit through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 17
header("Demo 7 - Randomization unit: slice metrics")
slices = np.array(["short", "long", "code", "math", "multilingual"])
acc = np.array([0.82, 0.71, 0.66, 0.58, 0.62])
n = np.array([400, 260, 180, 160, 140])
se = np.sqrt(acc * (1 - acc) / n)
for name, a, e in zip(slices, acc, se):
print(f"slice={name:12s} accuracy={a:.3f} +/- {1.96*e:.3f}")
check_true(acc.min() < acc.max(), "slices reveal heterogeneous performance")
Demo 8: Overall evaluation criterion and guardrails
This cell studies Overall evaluation criterion and guardrails through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 19
header("Demo 8 - Overall evaluation criterion and guardrails: finite-sample accuracy interval")
n = 600
y = (np.random.rand(n) < 0.74).astype(float)
mean = y.mean()
se = np.sqrt(mean * (1 - mean) / n)
lo, hi = mean - 1.96 * se, mean + 1.96 * se
print(f"accuracy={mean:.3f}, 95% CI=[{lo:.3f}, {hi:.3f}], n={n}")
check_true(lo <= mean <= hi, "point estimate lies inside its interval")
Demo 9: Average treatment effect
This cell studies Average treatment effect through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 21
header("Demo 9 - Average treatment effect: two-proportion AB test")
n_c, n_t = 5000, 5000
p_c, p_t = 0.112, 0.124
x_c = np.random.binomial(n_c, p_c)
x_t = np.random.binomial(n_t, p_t)
phat_c, phat_t = x_c / n_c, x_t / n_t
diff = phat_t - phat_c
se = np.sqrt(phat_c * (1 - phat_c) / n_c + phat_t * (1 - phat_t) / n_t)
z = diff / se
print(f"control={phat_c:.4f}, treatment={phat_t:.4f}, diff={diff:.4f}, z={z:.2f}")
check_true(se > 0, "standard error is positive")
Demo 10: Power and Type I or Type II errors
This cell studies Power and Type I or Type II errors through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 23
header("Demo 10 - Power and Type I or Type II errors: sample size planning")
baseline = 0.10
mde = 0.01
z_alpha = 1.96
z_beta = 0.84
n_per_arm = 2 * baseline * (1 - baseline) * (z_alpha + z_beta) ** 2 / (mde ** 2)
print(f"n per arm for 1pp MDE around 10% baseline: {math.ceil(n_per_arm)}")
check_true(n_per_arm > 1000, "small effects need large online samples")
Demo 11: Hypotheses and decision rules
This cell studies Hypotheses and decision rules through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 25
header("Demo 11 - Hypotheses and decision rules: sample-ratio mismatch")
observed = np.array([5120, 4880])
expected = observed.sum() * np.array([0.5, 0.5])
chi2 = ((observed - expected) ** 2 / expected).sum()
print(f"observed={observed.tolist()}, chi2={chi2:.3f}")
check_true(chi2 >= 0, "chi-squared statistic is nonnegative")
Demo 12: Metric hierarchy
This cell studies Metric hierarchy through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 27
header("Demo 12 - Metric hierarchy: bootstrap uncertainty")
values = np.clip(np.random.normal(0.68, 0.18, size=500), 0, 1)
mean, lo, hi = bootstrap_mean_ci(values, B=800)
print(f"mean={mean:.3f}, bootstrap 95% CI=[{lo:.3f}, {hi:.3f}]")
check_true(hi > lo, "bootstrap interval has positive width")
Demo 13: Sample sizing
This cell studies Sample sizing through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 29
header("Demo 13 - Sample sizing: metric comparison plot")
names = np.array(["base", "prompt", "retrieval", "tool"])
scores = np.array([0.62, 0.68, 0.73, 0.70])
fig, ax = plt.subplots()
bars = ax.bar(names, scores, color=[COLORS["primary"], COLORS["secondary"], COLORS["tertiary"], COLORS["highlight"]])
ax.set_title("Evaluation metric across system variants")
ax.set_xlabel("System variant")
ax.set_ylabel("Score")
ax.set_ylim(0, 1)
for bar, val in zip(bars, scores):
ax.text(bar.get_x() + bar.get_width()/2, val + 0.02, f"{val:.2f}", ha="center")
fig.tight_layout()
plt.show()
print("plotted metric comparison for four variants")
check_true(scores.max() > scores.min(), "variants differ in measured score")
Demo 14: Stratification and blocking
This cell studies Stratification and blocking through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 31
header("Demo 14 - Stratification and blocking: factorial interaction")
A = np.array([0, 0, 1, 1])
B = np.array([0, 1, 0, 1])
y = 0.60 + 0.05 * A + 0.03 * B + 0.08 * A * B
interaction = y[3] - y[2] - y[1] + y[0]
for a, b, val in zip(A, B, y):
print(f"A={a}, B={b}, score={val:.3f}")
print(f"interaction={interaction:.3f}")
check_close(float(interaction), 0.08, tol=1e-8, message="interaction recovered")
Demo 15: Randomization checks
This cell studies Randomization checks through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 33
header("Demo 15 - Randomization checks: slice metrics")
slices = np.array(["short", "long", "code", "math", "multilingual"])
acc = np.array([0.82, 0.71, 0.66, 0.58, 0.62])
n = np.array([400, 260, 180, 160, 140])
se = np.sqrt(acc * (1 - acc) / n)
for name, a, e in zip(slices, acc, se):
print(f"slice={name:12s} accuracy={a:.3f} +/- {1.96*e:.3f}")
check_true(acc.min() < acc.max(), "slices reveal heterogeneous performance")
Demo 16: Difference in means and proportions
This cell studies Difference in means and proportions through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 35
header("Demo 16 - Difference in means and proportions: finite-sample accuracy interval")
n = 600
y = (np.random.rand(n) < 0.74).astype(float)
mean = y.mean()
se = np.sqrt(mean * (1 - mean) / n)
lo, hi = mean - 1.96 * se, mean + 1.96 * se
print(f"accuracy={mean:.3f}, 95% CI=[{lo:.3f}, {hi:.3f}], n={n}")
check_true(lo <= mean <= hi, "point estimate lies inside its interval")
Demo 17: t tests and z tests
This cell studies t tests and z tests through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 37
header("Demo 17 - t tests and z tests: two-proportion AB test")
n_c, n_t = 5000, 5000
p_c, p_t = 0.112, 0.124
x_c = np.random.binomial(n_c, p_c)
x_t = np.random.binomial(n_t, p_t)
phat_c, phat_t = x_c / n_c, x_t / n_t
diff = phat_t - phat_c
se = np.sqrt(phat_c * (1 - phat_c) / n_c + phat_t * (1 - phat_t) / n_t)
z = diff / se
print(f"control={phat_c:.4f}, treatment={phat_t:.4f}, diff={diff:.4f}, z={z:.2f}")
check_true(se > 0, "standard error is positive")
Demo 18: Bootstrap confidence intervals
This cell studies Bootstrap confidence intervals through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 39
header("Demo 18 - Bootstrap confidence intervals: sample size planning")
baseline = 0.10
mde = 0.01
z_alpha = 1.96
z_beta = 0.84
n_per_arm = 2 * baseline * (1 - baseline) * (z_alpha + z_beta) ** 2 / (mde ** 2)
print(f"n per arm for 1pp MDE around 10% baseline: {math.ceil(n_per_arm)}")
check_true(n_per_arm > 1000, "small effects need large online samples")
Demo 19: CUPED preview
This cell studies CUPED preview through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 41
header("Demo 19 - CUPED preview: sample-ratio mismatch")
observed = np.array([5120, 4880])
expected = observed.sum() * np.array([0.5, 0.5])
chi2 = ((observed - expected) ** 2 / expected).sum()
print(f"observed={observed.tolist()}, chi2={chi2:.3f}")
check_true(chi2 >= 0, "chi-squared statistic is nonnegative")
Demo 20: Heterogeneous treatment effects
This cell studies Heterogeneous treatment effects through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 43
header("Demo 20 - Heterogeneous treatment effects: bootstrap uncertainty")
values = np.clip(np.random.normal(0.68, 0.18, size=500), 0, 1)
mean, lo, hi = bootstrap_mean_ci(values, B=800)
print(f"mean={mean:.3f}, bootstrap 95% CI=[{lo:.3f}, {hi:.3f}]")
check_true(hi > lo, "bootstrap interval has positive width")
Demo 21: Peeking risk
This cell studies Peeking risk through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 45
header("Demo 21 - Peeking risk: metric comparison plot")
names = np.array(["base", "prompt", "retrieval", "tool"])
scores = np.array([0.62, 0.68, 0.73, 0.70])
fig, ax = plt.subplots()
bars = ax.bar(names, scores, color=[COLORS["primary"], COLORS["secondary"], COLORS["tertiary"], COLORS["highlight"]])
ax.set_title("Evaluation metric across system variants")
ax.set_xlabel("System variant")
ax.set_ylabel("Score")
ax.set_ylim(0, 1)
for bar, val in zip(bars, scores):
ax.text(bar.get_x() + bar.get_width()/2, val + 0.02, f"{val:.2f}", ha="center")
fig.tight_layout()
plt.show()
print("plotted metric comparison for four variants")
check_true(scores.max() > scores.min(), "variants differ in measured score")
Demo 22: Alpha spending
This cell studies Alpha spending through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 47
header("Demo 22 - Alpha spending: factorial interaction")
A = np.array([0, 0, 1, 1])
B = np.array([0, 1, 0, 1])
y = 0.60 + 0.05 * A + 0.03 * B + 0.08 * A * B
interaction = y[3] - y[2] - y[1] + y[0]
for a, b, val in zip(A, B, y):
print(f"A={a}, B={b}, score={val:.3f}")
print(f"interaction={interaction:.3f}")
check_close(float(interaction), 0.08, tol=1e-8, message="interaction recovered")
Demo 23: Always-valid p-values
This cell studies Always-valid p-values through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 49
header("Demo 23 - Always-valid p-values: slice metrics")
slices = np.array(["short", "long", "code", "math", "multilingual"])
acc = np.array([0.82, 0.71, 0.66, 0.58, 0.62])
n = np.array([400, 260, 180, 160, 140])
se = np.sqrt(acc * (1 - acc) / n)
for name, a, e in zip(slices, acc, se):
print(f"slice={name:12s} accuracy={a:.3f} +/- {1.96*e:.3f}")
check_true(acc.min() < acc.max(), "slices reveal heterogeneous performance")
Demo 24: Stopping rules
This cell studies Stopping rules through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 51
header("Demo 24 - Stopping rules: finite-sample accuracy interval")
n = 600
y = (np.random.rand(n) < 0.74).astype(float)
mean = y.mean()
se = np.sqrt(mean * (1 - mean) / n)
lo, hi = mean - 1.96 * se, mean + 1.96 * se
print(f"accuracy={mean:.3f}, 95% CI=[{lo:.3f}, {hi:.3f}], n={n}")
check_true(lo <= mean <= hi, "point estimate lies inside its interval")
Demo 25: Multiple online tests
This cell studies Multiple online tests through a small executable experiment. Focus on the estimator, the uncertainty statement, and the failure mode.
Code cell 53
header("Demo 25 - Multiple online tests: two-proportion AB test")
n_c, n_t = 5000, 5000
p_c, p_t = 0.112, 0.124
x_c = np.random.binomial(n_c, p_c)
x_t = np.random.binomial(n_t, p_t)
phat_c, phat_t = x_c / n_c, x_t / n_t
diff = phat_t - phat_c
se = np.sqrt(phat_c * (1 - phat_c) / n_c + phat_t * (1 - phat_t) / n_t)
z = diff / se
print(f"control={phat_c:.4f}, treatment={phat_t:.4f}, diff={diff:.4f}, z={z:.2f}")
check_true(se > 0, "standard error is positive")