Theory NotebookMath for LLMs

Generalization Bounds

Statistical Learning Theory / Generalization Bounds

Notesnotes.md Theory Notebooktheory.ipynb Exercises Notebookexercises.ipynb

Theory Notebook

Theory Notebook

Converted from theory.ipynb for web reading.

Generalization Bounds

Generalization bounds quantify when empirical performance is likely to approximate population performance.

This notebook is the executable companion to notes.md. It uses synthetic samples so every cell is reproducible.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3


import math

COLORS = {
    "primary":   "#0077BB",
    "secondary": "#EE7733",
    "tertiary":  "#009988",
    "error":     "#CC3311",
    "neutral":   "#555555",
    "highlight": "#EE3377",
}

def header(title):
    print("\n" + "=" * 72)
    print(title)
    print("=" * 72)

def check_true(condition, name):
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    assert ok, name

def check_close(value, target, tol=1e-8, name="value"):
    ok = abs(float(value) - float(target)) <= tol
    print(f"{'PASS' if ok else 'FAIL'} - {name}: got {float(value):.6f}, expected {float(target):.6f}")
    assert ok, name

def finite_class_bound(h_size, epsilon, delta):
    return (np.log(h_size) + np.log(1.0 / delta)) / epsilon

def hoeffding_gap(h_size, m, delta):
    return np.sqrt((np.log(2.0 * h_size) + np.log(1.0 / delta)) / (2.0 * m))

def bias_variance(y_hats, y_true, noise_var):
    mean_pred = np.mean(y_hats, axis=0)
    bias2 = np.mean((mean_pred - y_true) ** 2)
    variance = np.mean(np.var(y_hats, axis=0))
    return float(bias2), float(variance), float(noise_var)

def empirical_rademacher_linear(x, radius=1.0, trials=200):
    x = np.asarray(x, dtype=float)
    vals = []
    for _ in range(trials):
        sigma = np.random.choice([-1.0, 1.0], size=x.shape[0])
        vals.append(radius * abs(np.sum(sigma * x)) / x.shape[0])
    return float(np.mean(vals))

print("Helper functions ready.")

Demo 1: from empirical risk to true risk

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 5

header("Demo 1 - from empirical risk to true risk: finite-class sample complexity")
h_size = 128
epsilon = 0.05
delta = 0.01
m = finite_class_bound(h_size, epsilon, delta)
print("Hypothesis count:", h_size)
print("Required samples:", int(np.ceil(m)))
check_true(m > 0, "sample complexity is positive")
print("Learning-theory lesson: logarithmic class-size dependence is powerful but not magic.")

Demo 2: concentration as bridge

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 7

header("Demo 2 - concentration as bridge: empirical and true risk")
y_true = np.array([1, 0, 1, 1, 0, 0, 1, 0])
y_pred = np.array([1, 0, 0, 1, 0, 1, 1, 0])
empirical_risk = np.mean(y_true != y_pred)
print("Empirical risk:", empirical_risk)
check_close(empirical_risk, 0.25, name="classification empirical risk")
print("Learning-theory lesson: risk is a loss expectation, estimated from finite samples.")

Demo 3: uniform bounds

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 9

header("Demo 3 - uniform bounds: VC growth comparison")
m_values = np.arange(1, 11)
d = 3
vc_growth = np.array([sum(math.comb(int(m), i) for i in range(min(d, int(m)) + 1)) for m in m_values])
all_labelings = 2 ** m_values
print("m values:", m_values.tolist())
print("VC upper growth:", vc_growth.tolist())
check_true(np.all(vc_growth <= all_labelings), "growth function is bounded by all labelings")
fig, ax = plt.subplots()
ax.plot(m_values, all_labelings, color=COLORS["secondary"], label="All dichotomies")
ax.plot(m_values, vc_growth, color=COLORS["primary"], label="VC polynomial bound")
ax.set_title("Exponential labelings versus VC growth")
ax.set_xlabel("Sample size $m$")
ax.set_ylabel("Dichotomy count")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Learning-theory lesson: finite VC dimension turns exponential growth into polynomial growth.")

Demo 4: margin and norm bounds

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 11

header("Demo 4 - margin and norm bounds: bias-variance components")
x = np.linspace(-1, 1, 40)
y_true = x ** 2
predictions = []
for shift in [-0.15, -0.05, 0.05, 0.15]:
    predictions.append((x + shift) ** 2 + 0.02 * shift)
y_hats = np.vstack(predictions)
bias2, variance, noise = bias_variance(y_hats, y_true, noise_var=0.01)
print("Bias^2:", round(bias2, 6))
print("Variance:", round(variance, 6))
print("Noise:", round(noise, 6))
check_true(bias2 >= 0 and variance >= 0 and noise >= 0, "components are nonnegative")
print("Learning-theory lesson: expected error can be decomposed into interpretable terms.")

Demo 5: why bounds can be loose but useful

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 13

header("Demo 5 - why bounds can be loose but useful: Hoeffding generalization gap")
h_size = 64
m = 500
delta = 0.05
gap = hoeffding_gap(h_size, m, delta)
print("Generalization gap bound:", round(gap, 4))
check_true(0 < gap < 1, "gap bound is a probability-scale quantity")
print("Learning-theory lesson: more samples reduce the confidence penalty.")

Demo 6: true risk and empirical risk

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 15

header("Demo 6 - true risk and empirical risk: empirical Rademacher complexity")
x = np.linspace(-1, 1, 80)
rad = empirical_rademacher_linear(x, radius=2.0, trials=300)
print("Estimated empirical Rademacher complexity:", round(rad, 6))
check_true(rad >= 0, "Rademacher complexity is nonnegative")
print("Learning-theory lesson: fitting random signs is a data-dependent capacity test.")

Demo 7: generalization gap

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 17

header("Demo 7 - generalization gap: finite-class sample complexity")
h_size = 128
epsilon = 0.05
delta = 0.01
m = finite_class_bound(h_size, epsilon, delta)
print("Hypothesis count:", h_size)
print("Required samples:", int(np.ceil(m)))
check_true(m > 0, "sample complexity is positive")
print("Learning-theory lesson: logarithmic class-size dependence is powerful but not magic.")

Demo 8: uniform convergence

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 19

header("Demo 8 - uniform convergence: empirical and true risk")
y_true = np.array([1, 0, 1, 1, 0, 0, 1, 0])
y_pred = np.array([1, 0, 0, 1, 0, 1, 1, 0])
empirical_risk = np.mean(y_true != y_pred)
print("Empirical risk:", empirical_risk)
check_close(empirical_risk, 0.25, name="classification empirical risk")
print("Learning-theory lesson: risk is a loss expectation, estimated from finite samples.")

Demo 9: confidence parameter $\delta$

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 21

header("Demo 9 - confidence parameter $\\delta$: VC growth comparison")
m_values = np.arange(1, 11)
d = 3
vc_growth = np.array([sum(math.comb(int(m), i) for i in range(min(d, int(m)) + 1)) for m in m_values])
all_labelings = 2 ** m_values
print("m values:", m_values.tolist())
print("VC upper growth:", vc_growth.tolist())
check_true(np.all(vc_growth <= all_labelings), "growth function is bounded by all labelings")
fig, ax = plt.subplots()
ax.plot(m_values, all_labelings, color=COLORS["secondary"], label="All dichotomies")
ax.plot(m_values, vc_growth, color=COLORS["primary"], label="VC polynomial bound")
ax.set_title("Exponential labelings versus VC growth")
ax.set_xlabel("Sample size $m$")
ax.set_ylabel("Dichotomy count")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Learning-theory lesson: finite VC dimension turns exponential growth into polynomial growth.")

Demo 10: bound template

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 23

header("Demo 10 - bound template: bias-variance components")
x = np.linspace(-1, 1, 40)
y_true = x ** 2
predictions = []
for shift in [-0.15, -0.05, 0.05, 0.15]:
    predictions.append((x + shift) ** 2 + 0.02 * shift)
y_hats = np.vstack(predictions)
bias2, variance, noise = bias_variance(y_hats, y_true, noise_var=0.01)
print("Bias^2:", round(bias2, 6))
print("Variance:", round(variance, 6))
print("Noise:", round(noise, 6))
check_true(bias2 >= 0 and variance >= 0 and noise >= 0, "components are nonnegative")
print("Learning-theory lesson: expected error can be decomposed into interpretable terms.")

Demo 11: Hoeffding inequality recall

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 25

header("Demo 11 - Hoeffding inequality recall: Hoeffding generalization gap")
h_size = 64
m = 500
delta = 0.05
gap = hoeffding_gap(h_size, m, delta)
print("Generalization gap bound:", round(gap, 4))
check_true(0 < gap < 1, "gap bound is a probability-scale quantity")
print("Learning-theory lesson: more samples reduce the confidence penalty.")

Demo 12: union bound over $\mathcal{H}$

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 27

header("Demo 12 - union bound over $\\mathcal{H}$: empirical Rademacher complexity")
x = np.linspace(-1, 1, 80)
rad = empirical_rademacher_linear(x, radius=2.0, trials=300)
print("Estimated empirical Rademacher complexity:", round(rad, 6))
check_true(rad >= 0, "Rademacher complexity is nonnegative")
print("Learning-theory lesson: fitting random signs is a data-dependent capacity test.")

Demo 13: realizable bound

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 29

header("Demo 13 - realizable bound: finite-class sample complexity")
h_size = 128
epsilon = 0.05
delta = 0.01
m = finite_class_bound(h_size, epsilon, delta)
print("Hypothesis count:", h_size)
print("Required samples:", int(np.ceil(m)))
check_true(m > 0, "sample complexity is positive")
print("Learning-theory lesson: logarithmic class-size dependence is powerful but not magic.")

Demo 14: agnostic bound

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 31

header("Demo 14 - agnostic bound: empirical and true risk")
y_true = np.array([1, 0, 1, 1, 0, 0, 1, 0])
y_pred = np.array([1, 0, 0, 1, 0, 1, 1, 0])
empirical_risk = np.mean(y_true != y_pred)
print("Empirical risk:", empirical_risk)
check_close(empirical_risk, 0.25, name="classification empirical risk")
print("Learning-theory lesson: risk is a loss expectation, estimated from finite samples.")

Demo 15: sample-size calculator

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 33

header("Demo 15 - sample-size calculator: VC growth comparison")
m_values = np.arange(1, 11)
d = 3
vc_growth = np.array([sum(math.comb(int(m), i) for i in range(min(d, int(m)) + 1)) for m in m_values])
all_labelings = 2 ** m_values
print("m values:", m_values.tolist())
print("VC upper growth:", vc_growth.tolist())
check_true(np.all(vc_growth <= all_labelings), "growth function is bounded by all labelings")
fig, ax = plt.subplots()
ax.plot(m_values, all_labelings, color=COLORS["secondary"], label="All dichotomies")
ax.plot(m_values, vc_growth, color=COLORS["primary"], label="VC polynomial bound")
ax.set_title("Exponential labelings versus VC growth")
ax.set_xlabel("Sample size $m$")
ax.set_ylabel("Dichotomy count")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Learning-theory lesson: finite VC dimension turns exponential growth into polynomial growth.")

Demo 16: VC bound

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 35

header("Demo 16 - VC bound: bias-variance components")
x = np.linspace(-1, 1, 40)
y_true = x ** 2
predictions = []
for shift in [-0.15, -0.05, 0.05, 0.15]:
    predictions.append((x + shift) ** 2 + 0.02 * shift)
y_hats = np.vstack(predictions)
bias2, variance, noise = bias_variance(y_hats, y_true, noise_var=0.01)
print("Bias^2:", round(bias2, 6))
print("Variance:", round(variance, 6))
print("Noise:", round(noise, 6))
check_true(bias2 >= 0 and variance >= 0 and noise >= 0, "components are nonnegative")
print("Learning-theory lesson: expected error can be decomposed into interpretable terms.")

Demo 17: Rademacher preview

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 37

header("Demo 17 - Rademacher preview: Hoeffding generalization gap")
h_size = 64
m = 500
delta = 0.05
gap = hoeffding_gap(h_size, m, delta)
print("Generalization gap bound:", round(gap, 4))
check_true(0 < gap < 1, "gap bound is a probability-scale quantity")
print("Learning-theory lesson: more samples reduce the confidence penalty.")

Demo 18: covering number preview

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 39

header("Demo 18 - covering number preview: empirical Rademacher complexity")
x = np.linspace(-1, 1, 80)
rad = empirical_rademacher_linear(x, radius=2.0, trials=300)
print("Estimated empirical Rademacher complexity:", round(rad, 6))
check_true(rad >= 0, "Rademacher complexity is nonnegative")
print("Learning-theory lesson: fitting random signs is a data-dependent capacity test.")

Demo 19: compression preview

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 41

header("Demo 19 - compression preview: finite-class sample complexity")
h_size = 128
epsilon = 0.05
delta = 0.01
m = finite_class_bound(h_size, epsilon, delta)
print("Hypothesis count:", h_size)
print("Required samples:", int(np.ceil(m)))
check_true(m > 0, "sample complexity is positive")
print("Learning-theory lesson: logarithmic class-size dependence is powerful but not magic.")

Demo 20: stability preview

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 43

header("Demo 20 - stability preview: empirical and true risk")
y_true = np.array([1, 0, 1, 1, 0, 0, 1, 0])
y_pred = np.array([1, 0, 0, 1, 0, 1, 1, 0])
empirical_risk = np.mean(y_true != y_pred)
print("Empirical risk:", empirical_risk)
check_close(empirical_risk, 0.25, name="classification empirical risk")
print("Learning-theory lesson: risk is a loss expectation, estimated from finite samples.")

Demo 21: margins for classifiers

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 45

header("Demo 21 - margins for classifiers: VC growth comparison")
m_values = np.arange(1, 11)
d = 3
vc_growth = np.array([sum(math.comb(int(m), i) for i in range(min(d, int(m)) + 1)) for m in m_values])
all_labelings = 2 ** m_values
print("m values:", m_values.tolist())
print("VC upper growth:", vc_growth.tolist())
check_true(np.all(vc_growth <= all_labelings), "growth function is bounded by all labelings")
fig, ax = plt.subplots()
ax.plot(m_values, all_labelings, color=COLORS["secondary"], label="All dichotomies")
ax.plot(m_values, vc_growth, color=COLORS["primary"], label="VC polynomial bound")
ax.set_title("Exponential labelings versus VC growth")
ax.set_xlabel("Sample size $m$")
ax.set_ylabel("Dichotomy count")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Learning-theory lesson: finite VC dimension turns exponential growth into polynomial growth.")

Demo 22: perceptron and SVM intuition

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 47

header("Demo 22 - perceptron and SVM intuition: bias-variance components")
x = np.linspace(-1, 1, 40)
y_true = x ** 2
predictions = []
for shift in [-0.15, -0.05, 0.05, 0.15]:
    predictions.append((x + shift) ** 2 + 0.02 * shift)
y_hats = np.vstack(predictions)
bias2, variance, noise = bias_variance(y_hats, y_true, noise_var=0.01)
print("Bias^2:", round(bias2, 6))
print("Variance:", round(variance, 6))
print("Noise:", round(noise, 6))
check_true(bias2 >= 0 and variance >= 0 and noise >= 0, "components are nonnegative")
print("Learning-theory lesson: expected error can be decomposed into interpretable terms.")

Demo 23: norm-based neural network bounds

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 49

header("Demo 23 - norm-based neural network bounds: Hoeffding generalization gap")
h_size = 64
m = 500
delta = 0.05
gap = hoeffding_gap(h_size, m, delta)
print("Generalization gap bound:", round(gap, 4))
check_true(0 < gap < 1, "gap bound is a probability-scale quantity")
print("Learning-theory lesson: more samples reduce the confidence penalty.")

Demo 24: PAC-Bayes preview

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 51

header("Demo 24 - PAC-Bayes preview: empirical Rademacher complexity")
x = np.linspace(-1, 1, 80)
rad = empirical_rademacher_linear(x, radius=2.0, trials=300)
print("Estimated empirical Rademacher complexity:", round(rad, 6))
check_true(rad >= 0, "Rademacher complexity is nonnegative")
print("Learning-theory lesson: fitting random signs is a data-dependent capacity test.")

Demo 25: calibration to practice

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 53

header("Demo 25 - calibration to practice: finite-class sample complexity")
h_size = 128
epsilon = 0.05
delta = 0.01
m = finite_class_bound(h_size, epsilon, delta)
print("Hypothesis count:", h_size)
print("Required samples:", int(np.ceil(m)))
check_true(m > 0, "sample complexity is positive")
print("Learning-theory lesson: logarithmic class-size dependence is powerful but not magic.")

Previous lesson Next lesson