Theory Notebook
Converted from
theory.ipynbfor web reading.
Bias Variance Tradeoff
The bias-variance tradeoff decomposes expected prediction error into approximation, estimation, and noise terms.
This notebook is the executable companion to notes.md. It uses synthetic samples so every cell is reproducible.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
import seaborn as sns
sns.set_theme(style="whitegrid", palette="colorblind")
HAS_SNS = True
except ImportError:
plt.style.use("seaborn-v0_8-whitegrid")
HAS_SNS = False
mpl.rcParams.update({
"figure.figsize": (10, 6),
"figure.dpi": 120,
"font.size": 13,
"axes.titlesize": 15,
"axes.labelsize": 13,
"xtick.labelsize": 11,
"ytick.labelsize": 11,
"legend.fontsize": 11,
"legend.framealpha": 0.85,
"lines.linewidth": 2.0,
"axes.spines.top": False,
"axes.spines.right": False,
"savefig.bbox": "tight",
"savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import math
COLORS = {
"primary": "#0077BB",
"secondary": "#EE7733",
"tertiary": "#009988",
"error": "#CC3311",
"neutral": "#555555",
"highlight": "#EE3377",
}
def header(title):
print("\n" + "=" * 72)
print(title)
print("=" * 72)
def check_true(condition, name):
ok = bool(condition)
print(f"{'PASS' if ok else 'FAIL'} - {name}")
assert ok, name
def check_close(value, target, tol=1e-8, name="value"):
ok = abs(float(value) - float(target)) <= tol
print(f"{'PASS' if ok else 'FAIL'} - {name}: got {float(value):.6f}, expected {float(target):.6f}")
assert ok, name
def finite_class_bound(h_size, epsilon, delta):
return (np.log(h_size) + np.log(1.0 / delta)) / epsilon
def hoeffding_gap(h_size, m, delta):
return np.sqrt((np.log(2.0 * h_size) + np.log(1.0 / delta)) / (2.0 * m))
def bias_variance(y_hats, y_true, noise_var):
mean_pred = np.mean(y_hats, axis=0)
bias2 = np.mean((mean_pred - y_true) ** 2)
variance = np.mean(np.var(y_hats, axis=0))
return float(bias2), float(variance), float(noise_var)
def empirical_rademacher_linear(x, radius=1.0, trials=200):
x = np.asarray(x, dtype=float)
vals = []
for _ in range(trials):
sigma = np.random.choice([-1.0, 1.0], size=x.shape[0])
vals.append(radius * abs(np.sum(sigma * x)) / x.shape[0])
return float(np.mean(vals))
print("Helper functions ready.")
Demo 1: underfitting and overfitting
This demo turns the approved TOC concept into a small executable learning-theory object.
Code cell 5
header("Demo 1 - underfitting and overfitting: finite-class sample complexity")
h_size = 128
epsilon = 0.05
delta = 0.01
m = finite_class_bound(h_size, epsilon, delta)
print("Hypothesis count:", h_size)
print("Required samples:", int(np.ceil(m)))
check_true(m > 0, "sample complexity is positive")
print("Learning-theory lesson: logarithmic class-size dependence is powerful but not magic.")
Demo 2: model complexity curve
This demo turns the approved TOC concept into a small executable learning-theory object.
Code cell 7
header("Demo 2 - model complexity curve: empirical and true risk")
y_true = np.array([1, 0, 1, 1, 0, 0, 1, 0])
y_pred = np.array([1, 0, 0, 1, 0, 1, 1, 0])
empirical_risk = np.mean(y_true != y_pred)
print("Empirical risk:", empirical_risk)
check_close(empirical_risk, 0.25, name="classification empirical risk")
print("Learning-theory lesson: risk is a loss expectation, estimated from finite samples.")
Demo 3: irreducible noise
This demo turns the approved TOC concept into a small executable learning-theory object.
Code cell 9
header("Demo 3 - irreducible noise: VC growth comparison")
m_values = np.arange(1, 11)
d = 3
vc_growth = np.array([sum(math.comb(int(m), i) for i in range(min(d, int(m)) + 1)) for m in m_values])
all_labelings = 2 ** m_values
print("m values:", m_values.tolist())
print("VC upper growth:", vc_growth.tolist())
check_true(np.all(vc_growth <= all_labelings), "growth function is bounded by all labelings")
fig, ax = plt.subplots()
ax.plot(m_values, all_labelings, color=COLORS["secondary"], label="All dichotomies")
ax.plot(m_values, vc_growth, color=COLORS["primary"], label="VC polynomial bound")
ax.set_title("Exponential labelings versus VC growth")
ax.set_xlabel("Sample size $m$")
ax.set_ylabel("Dichotomy count")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Learning-theory lesson: finite VC dimension turns exponential growth into polynomial growth.")
Demo 4: why train loss can mislead
This demo turns the approved TOC concept into a small executable learning-theory object.
Code cell 11
header("Demo 4 - why train loss can mislead: bias-variance components")
x = np.linspace(-1, 1, 40)
y_true = x ** 2
predictions = []
for shift in [-0.15, -0.05, 0.05, 0.15]:
predictions.append((x + shift) ** 2 + 0.02 * shift)
y_hats = np.vstack(predictions)
bias2, variance, noise = bias_variance(y_hats, y_true, noise_var=0.01)
print("Bias^2:", round(bias2, 6))
print("Variance:", round(variance, 6))
print("Noise:", round(noise, 6))
check_true(bias2 >= 0 and variance >= 0 and noise >= 0, "components are nonnegative")
print("Learning-theory lesson: expected error can be decomposed into interpretable terms.")
Demo 5: double descent preview
This demo turns the approved TOC concept into a small executable learning-theory object.
Code cell 13
header("Demo 5 - double descent preview: Hoeffding generalization gap")
h_size = 64
m = 500
delta = 0.05
gap = hoeffding_gap(h_size, m, delta)
print("Generalization gap bound:", round(gap, 4))
check_true(0 < gap < 1, "gap bound is a probability-scale quantity")
print("Learning-theory lesson: more samples reduce the confidence penalty.")
Demo 6: data-generating function
This demo turns the approved TOC concept into a small executable learning-theory object.
Code cell 15
header("Demo 6 - data-generating function $f^\\star$: empirical Rademacher complexity")
x = np.linspace(-1, 1, 80)
rad = empirical_rademacher_linear(x, radius=2.0, trials=300)
print("Estimated empirical Rademacher complexity:", round(rad, 6))
check_true(rad >= 0, "Rademacher complexity is nonnegative")
print("Learning-theory lesson: fitting random signs is a data-dependent capacity test.")
Demo 7: estimator
This demo turns the approved TOC concept into a small executable learning-theory object.
Code cell 17
header("Demo 7 - estimator $\\hat{f}_S$: finite-class sample complexity")
h_size = 128
epsilon = 0.05
delta = 0.01
m = finite_class_bound(h_size, epsilon, delta)
print("Hypothesis count:", h_size)
print("Required samples:", int(np.ceil(m)))
check_true(m > 0, "sample complexity is positive")
print("Learning-theory lesson: logarithmic class-size dependence is powerful but not magic.")
Demo 8: bias
This demo turns the approved TOC concept into a small executable learning-theory object.
Code cell 19
header("Demo 8 - bias: empirical and true risk")
y_true = np.array([1, 0, 1, 1, 0, 0, 1, 0])
y_pred = np.array([1, 0, 0, 1, 0, 1, 1, 0])
empirical_risk = np.mean(y_true != y_pred)
print("Empirical risk:", empirical_risk)
check_close(empirical_risk, 0.25, name="classification empirical risk")
print("Learning-theory lesson: risk is a loss expectation, estimated from finite samples.")
Demo 9: variance
This demo turns the approved TOC concept into a small executable learning-theory object.
Code cell 21
header("Demo 9 - variance: VC growth comparison")
m_values = np.arange(1, 11)
d = 3
vc_growth = np.array([sum(math.comb(int(m), i) for i in range(min(d, int(m)) + 1)) for m in m_values])
all_labelings = 2 ** m_values
print("m values:", m_values.tolist())
print("VC upper growth:", vc_growth.tolist())
check_true(np.all(vc_growth <= all_labelings), "growth function is bounded by all labelings")
fig, ax = plt.subplots()
ax.plot(m_values, all_labelings, color=COLORS["secondary"], label="All dichotomies")
ax.plot(m_values, vc_growth, color=COLORS["primary"], label="VC polynomial bound")
ax.set_title("Exponential labelings versus VC growth")
ax.set_xlabel("Sample size $m$")
ax.set_ylabel("Dichotomy count")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Learning-theory lesson: finite VC dimension turns exponential growth into polynomial growth.")
Demo 10: noise variance
This demo turns the approved TOC concept into a small executable learning-theory object.
Code cell 23
header("Demo 10 - noise variance $\\sigma^2$: bias-variance components")
x = np.linspace(-1, 1, 40)
y_true = x ** 2
predictions = []
for shift in [-0.15, -0.05, 0.05, 0.15]:
predictions.append((x + shift) ** 2 + 0.02 * shift)
y_hats = np.vstack(predictions)
bias2, variance, noise = bias_variance(y_hats, y_true, noise_var=0.01)
print("Bias^2:", round(bias2, 6))
print("Variance:", round(variance, 6))
print("Noise:", round(noise, 6))
check_true(bias2 >= 0 and variance >= 0 and noise >= 0, "components are nonnegative")
print("Learning-theory lesson: expected error can be decomposed into interpretable terms.")
Demo 11: pointwise decomposition
This demo turns the approved TOC concept into a small executable learning-theory object.
Code cell 25
header("Demo 11 - pointwise decomposition: Hoeffding generalization gap")
h_size = 64
m = 500
delta = 0.05
gap = hoeffding_gap(h_size, m, delta)
print("Generalization gap bound:", round(gap, 4))
check_true(0 < gap < 1, "gap bound is a probability-scale quantity")
print("Learning-theory lesson: more samples reduce the confidence penalty.")
Demo 12: integrated risk
This demo turns the approved TOC concept into a small executable learning-theory object.
Code cell 27
header("Demo 12 - integrated risk: empirical Rademacher complexity")
x = np.linspace(-1, 1, 80)
rad = empirical_rademacher_linear(x, radius=2.0, trials=300)
print("Estimated empirical Rademacher complexity:", round(rad, 6))
check_true(rad >= 0, "Rademacher complexity is nonnegative")
print("Learning-theory lesson: fitting random signs is a data-dependent capacity test.")
Demo 13: proof sketch
This demo turns the approved TOC concept into a small executable learning-theory object.
Code cell 29
header("Demo 13 - proof sketch: finite-class sample complexity")
h_size = 128
epsilon = 0.05
delta = 0.01
m = finite_class_bound(h_size, epsilon, delta)
print("Hypothesis count:", h_size)
print("Required samples:", int(np.ceil(m)))
check_true(m > 0, "sample complexity is positive")
print("Learning-theory lesson: logarithmic class-size dependence is powerful but not magic.")
Demo 14: simulation with polynomial regression
This demo turns the approved TOC concept into a small executable learning-theory object.
Code cell 31
header("Demo 14 - simulation with polynomial regression: empirical and true risk")
y_true = np.array([1, 0, 1, 1, 0, 0, 1, 0])
y_pred = np.array([1, 0, 0, 1, 0, 1, 1, 0])
empirical_risk = np.mean(y_true != y_pred)
print("Empirical risk:", empirical_risk)
check_close(empirical_risk, 0.25, name="classification empirical risk")
print("Learning-theory lesson: risk is a loss expectation, estimated from finite samples.")
Demo 15: interpretation
This demo turns the approved TOC concept into a small executable learning-theory object.
Code cell 33
header("Demo 15 - interpretation: VC growth comparison")
m_values = np.arange(1, 11)
d = 3
vc_growth = np.array([sum(math.comb(int(m), i) for i in range(min(d, int(m)) + 1)) for m in m_values])
all_labelings = 2 ** m_values
print("m values:", m_values.tolist())
print("VC upper growth:", vc_growth.tolist())
check_true(np.all(vc_growth <= all_labelings), "growth function is bounded by all labelings")
fig, ax = plt.subplots()
ax.plot(m_values, all_labelings, color=COLORS["secondary"], label="All dichotomies")
ax.plot(m_values, vc_growth, color=COLORS["primary"], label="VC polynomial bound")
ax.set_title("Exponential labelings versus VC growth")
ax.set_xlabel("Sample size $m$")
ax.set_ylabel("Dichotomy count")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Learning-theory lesson: finite VC dimension turns exponential growth into polynomial growth.")
Demo 16: model class size
This demo turns the approved TOC concept into a small executable learning-theory object.
Code cell 35
header("Demo 16 - model class size: bias-variance components")
x = np.linspace(-1, 1, 40)
y_true = x ** 2
predictions = []
for shift in [-0.15, -0.05, 0.05, 0.15]:
predictions.append((x + shift) ** 2 + 0.02 * shift)
y_hats = np.vstack(predictions)
bias2, variance, noise = bias_variance(y_hats, y_true, noise_var=0.01)
print("Bias^2:", round(bias2, 6))
print("Variance:", round(variance, 6))
print("Noise:", round(noise, 6))
check_true(bias2 >= 0 and variance >= 0 and noise >= 0, "components are nonnegative")
print("Learning-theory lesson: expected error can be decomposed into interpretable terms.")
Demo 17: regularization as variance control
This demo turns the approved TOC concept into a small executable learning-theory object.
Code cell 37
header("Demo 17 - regularization as variance control: Hoeffding generalization gap")
h_size = 64
m = 500
delta = 0.05
gap = hoeffding_gap(h_size, m, delta)
print("Generalization gap bound:", round(gap, 4))
check_true(0 < gap < 1, "gap bound is a probability-scale quantity")
print("Learning-theory lesson: more samples reduce the confidence penalty.")
Demo 18: early stopping preview
This demo turns the approved TOC concept into a small executable learning-theory object.
Code cell 39
header("Demo 18 - early stopping preview: empirical Rademacher complexity")
x = np.linspace(-1, 1, 80)
rad = empirical_rademacher_linear(x, radius=2.0, trials=300)
print("Estimated empirical Rademacher complexity:", round(rad, 6))
check_true(rad >= 0, "Rademacher complexity is nonnegative")
print("Learning-theory lesson: fitting random signs is a data-dependent capacity test.")
Demo 19: hyperparameter selection
This demo turns the approved TOC concept into a small executable learning-theory object.
Code cell 41
header("Demo 19 - hyperparameter selection: finite-class sample complexity")
h_size = 128
epsilon = 0.05
delta = 0.01
m = finite_class_bound(h_size, epsilon, delta)
print("Hypothesis count:", h_size)
print("Required samples:", int(np.ceil(m)))
check_true(m > 0, "sample complexity is positive")
print("Learning-theory lesson: logarithmic class-size dependence is powerful but not magic.")
Demo 20: validation curves
This demo turns the approved TOC concept into a small executable learning-theory object.
Code cell 43
header("Demo 20 - validation curves: empirical and true risk")
y_true = np.array([1, 0, 1, 1, 0, 0, 1, 0])
y_pred = np.array([1, 0, 0, 1, 0, 1, 1, 0])
empirical_risk = np.mean(y_true != y_pred)
print("Empirical risk:", empirical_risk)
check_close(empirical_risk, 0.25, name="classification empirical risk")
print("Learning-theory lesson: risk is a loss expectation, estimated from finite samples.")
Demo 21: interpolation threshold
This demo turns the approved TOC concept into a small executable learning-theory object.
Code cell 45
header("Demo 21 - interpolation threshold: VC growth comparison")
m_values = np.arange(1, 11)
d = 3
vc_growth = np.array([sum(math.comb(int(m), i) for i in range(min(d, int(m)) + 1)) for m in m_values])
all_labelings = 2 ** m_values
print("m values:", m_values.tolist())
print("VC upper growth:", vc_growth.tolist())
check_true(np.all(vc_growth <= all_labelings), "growth function is bounded by all labelings")
fig, ax = plt.subplots()
ax.plot(m_values, all_labelings, color=COLORS["secondary"], label="All dichotomies")
ax.plot(m_values, vc_growth, color=COLORS["primary"], label="VC polynomial bound")
ax.set_title("Exponential labelings versus VC growth")
ax.set_xlabel("Sample size $m$")
ax.set_ylabel("Dichotomy count")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Learning-theory lesson: finite VC dimension turns exponential growth into polynomial growth.")
Demo 22: double descent
This demo turns the approved TOC concept into a small executable learning-theory object.
Code cell 47
header("Demo 22 - double descent: bias-variance components")
x = np.linspace(-1, 1, 40)
y_true = x ** 2
predictions = []
for shift in [-0.15, -0.05, 0.05, 0.15]:
predictions.append((x + shift) ** 2 + 0.02 * shift)
y_hats = np.vstack(predictions)
bias2, variance, noise = bias_variance(y_hats, y_true, noise_var=0.01)
print("Bias^2:", round(bias2, 6))
print("Variance:", round(variance, 6))
print("Noise:", round(noise, 6))
check_true(bias2 >= 0 and variance >= 0 and noise >= 0, "components are nonnegative")
print("Learning-theory lesson: expected error can be decomposed into interpretable terms.")
Demo 23: benign overfitting preview
This demo turns the approved TOC concept into a small executable learning-theory object.
Code cell 49
header("Demo 23 - benign overfitting preview: Hoeffding generalization gap")
h_size = 64
m = 500
delta = 0.05
gap = hoeffding_gap(h_size, m, delta)
print("Generalization gap bound:", round(gap, 4))
check_true(0 < gap < 1, "gap bound is a probability-scale quantity")
print("Learning-theory lesson: more samples reduce the confidence penalty.")
Demo 24: deep learning caveats
This demo turns the approved TOC concept into a small executable learning-theory object.
Code cell 51
header("Demo 24 - deep learning caveats: empirical Rademacher complexity")
x = np.linspace(-1, 1, 80)
rad = empirical_rademacher_linear(x, radius=2.0, trials=300)
print("Estimated empirical Rademacher complexity:", round(rad, 6))
check_true(rad >= 0, "Rademacher complexity is nonnegative")
print("Learning-theory lesson: fitting random signs is a data-dependent capacity test.")
Demo 25: why this does not replace bounds
This demo turns the approved TOC concept into a small executable learning-theory object.
Code cell 53
header("Demo 25 - why this does not replace bounds: finite-class sample complexity")
h_size = 128
epsilon = 0.05
delta = 0.01
m = finite_class_bound(h_size, epsilon, delta)
print("Hypothesis count:", h_size)
print("Required samples:", int(np.ceil(m)))
check_true(m > 0, "sample complexity is positive")
print("Learning-theory lesson: logarithmic class-size dependence is powerful but not magic.")