Theory NotebookMath for LLMs

Rademacher Complexity

Statistical Learning Theory / Rademacher Complexity

Notesnotes.md Theory Notebooktheory.ipynb Exercises Notebookexercises.ipynb

Theory Notebook

Theory Notebook

Converted from theory.ipynb for web reading.

Rademacher Complexity

Rademacher complexity is a data-dependent measure of how well a function class correlates with random signs.

This notebook is the executable companion to notes.md. It uses synthetic samples so every cell is reproducible.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3


import math

COLORS = {
    "primary":   "#0077BB",
    "secondary": "#EE7733",
    "tertiary":  "#009988",
    "error":     "#CC3311",
    "neutral":   "#555555",
    "highlight": "#EE3377",
}

def header(title):
    print("\n" + "=" * 72)
    print(title)
    print("=" * 72)

def check_true(condition, name):
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    assert ok, name

def check_close(value, target, tol=1e-8, name="value"):
    ok = abs(float(value) - float(target)) <= tol
    print(f"{'PASS' if ok else 'FAIL'} - {name}: got {float(value):.6f}, expected {float(target):.6f}")
    assert ok, name

def finite_class_bound(h_size, epsilon, delta):
    return (np.log(h_size) + np.log(1.0 / delta)) / epsilon

def hoeffding_gap(h_size, m, delta):
    return np.sqrt((np.log(2.0 * h_size) + np.log(1.0 / delta)) / (2.0 * m))

def bias_variance(y_hats, y_true, noise_var):
    mean_pred = np.mean(y_hats, axis=0)
    bias2 = np.mean((mean_pred - y_true) ** 2)
    variance = np.mean(np.var(y_hats, axis=0))
    return float(bias2), float(variance), float(noise_var)

def empirical_rademacher_linear(x, radius=1.0, trials=200):
    x = np.asarray(x, dtype=float)
    vals = []
    for _ in range(trials):
        sigma = np.random.choice([-1.0, 1.0], size=x.shape[0])
        vals.append(radius * abs(np.sum(sigma * x)) / x.shape[0])
    return float(np.mean(vals))

print("Helper functions ready.")

Demo 1: fitting random noise as complexity

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 5

header("Demo 1 - fitting random noise as complexity: finite-class sample complexity")
h_size = 128
epsilon = 0.05
delta = 0.01
m = finite_class_bound(h_size, epsilon, delta)
print("Hypothesis count:", h_size)
print("Required samples:", int(np.ceil(m)))
check_true(m > 0, "sample complexity is positive")
print("Learning-theory lesson: logarithmic class-size dependence is powerful but not magic.")

Demo 2: data-dependent capacity

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 7

header("Demo 2 - data-dependent capacity: empirical and true risk")
y_true = np.array([1, 0, 1, 1, 0, 0, 1, 0])
y_pred = np.array([1, 0, 0, 1, 0, 1, 1, 0])
empirical_risk = np.mean(y_true != y_pred)
print("Empirical risk:", empirical_risk)
check_close(empirical_risk, 0.25, name="classification empirical risk")
print("Learning-theory lesson: risk is a loss expectation, estimated from finite samples.")

Demo 3: why VC can be too coarse

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 9

header("Demo 3 - why VC can be too coarse: VC growth comparison")
m_values = np.arange(1, 11)
d = 3
vc_growth = np.array([sum(math.comb(int(m), i) for i in range(min(d, int(m)) + 1)) for m in m_values])
all_labelings = 2 ** m_values
print("m values:", m_values.tolist())
print("VC upper growth:", vc_growth.tolist())
check_true(np.all(vc_growth <= all_labelings), "growth function is bounded by all labelings")
fig, ax = plt.subplots()
ax.plot(m_values, all_labelings, color=COLORS["secondary"], label="All dichotomies")
ax.plot(m_values, vc_growth, color=COLORS["primary"], label="VC polynomial bound")
ax.set_title("Exponential labelings versus VC growth")
ax.set_xlabel("Sample size $m$")
ax.set_ylabel("Dichotomy count")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Learning-theory lesson: finite VC dimension turns exponential growth into polynomial growth.")

Demo 4: random signs

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 11

header("Demo 4 - random signs: bias-variance components")
x = np.linspace(-1, 1, 40)
y_true = x ** 2
predictions = []
for shift in [-0.15, -0.05, 0.05, 0.15]:
    predictions.append((x + shift) ** 2 + 0.02 * shift)
y_hats = np.vstack(predictions)
bias2, variance, noise = bias_variance(y_hats, y_true, noise_var=0.01)
print("Bias^2:", round(bias2, 6))
print("Variance:", round(variance, 6))
print("Noise:", round(noise, 6))
check_true(bias2 >= 0 and variance >= 0 and noise >= 0, "components are nonnegative")
print("Learning-theory lesson: expected error can be decomposed into interpretable terms.")

Demo 5: empirical complexity

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 13

header("Demo 5 - empirical complexity: Hoeffding generalization gap")
h_size = 64
m = 500
delta = 0.05
gap = hoeffding_gap(h_size, m, delta)
print("Generalization gap bound:", round(gap, 4))
check_true(0 < gap < 1, "gap bound is a probability-scale quantity")
print("Learning-theory lesson: more samples reduce the confidence penalty.")

Demo 6: Rademacher variables $\sigma_i$

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 15

header("Demo 6 - Rademacher variables $\\sigma_i$: empirical Rademacher complexity")
x = np.linspace(-1, 1, 80)
rad = empirical_rademacher_linear(x, radius=2.0, trials=300)
print("Estimated empirical Rademacher complexity:", round(rad, 6))
check_true(rad >= 0, "Rademacher complexity is nonnegative")
print("Learning-theory lesson: fitting random signs is a data-dependent capacity test.")

Demo 7: empirical Rademacher complexity $\widehat{\mathfrak{R}}_S(\mathcal{H})$

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 17

header("Demo 7 - empirical Rademacher complexity $\\widehat{\\mathfrak{R}}_S(\\mathcal{H})$: finite-class sample complexity")
h_size = 128
epsilon = 0.05
delta = 0.01
m = finite_class_bound(h_size, epsilon, delta)
print("Hypothesis count:", h_size)
print("Required samples:", int(np.ceil(m)))
check_true(m > 0, "sample complexity is positive")
print("Learning-theory lesson: logarithmic class-size dependence is powerful but not magic.")

Demo 8: expected Rademacher complexity

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 19

header("Demo 8 - expected Rademacher complexity: empirical and true risk")
y_true = np.array([1, 0, 1, 1, 0, 0, 1, 0])
y_pred = np.array([1, 0, 0, 1, 0, 1, 1, 0])
empirical_risk = np.mean(y_true != y_pred)
print("Empirical risk:", empirical_risk)
check_close(empirical_risk, 0.25, name="classification empirical risk")
print("Learning-theory lesson: risk is a loss expectation, estimated from finite samples.")

Demo 9: function classes

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 21

header("Demo 9 - function classes: VC growth comparison")
m_values = np.arange(1, 11)
d = 3
vc_growth = np.array([sum(math.comb(int(m), i) for i in range(min(d, int(m)) + 1)) for m in m_values])
all_labelings = 2 ** m_values
print("m values:", m_values.tolist())
print("VC upper growth:", vc_growth.tolist())
check_true(np.all(vc_growth <= all_labelings), "growth function is bounded by all labelings")
fig, ax = plt.subplots()
ax.plot(m_values, all_labelings, color=COLORS["secondary"], label="All dichotomies")
ax.plot(m_values, vc_growth, color=COLORS["primary"], label="VC polynomial bound")
ax.set_title("Exponential labelings versus VC growth")
ax.set_xlabel("Sample size $m$")
ax.set_ylabel("Dichotomy count")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Learning-theory lesson: finite VC dimension turns exponential growth into polynomial growth.")

Demo 10: loss-composed classes

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 23

header("Demo 10 - loss-composed classes: bias-variance components")
x = np.linspace(-1, 1, 40)
y_true = x ** 2
predictions = []
for shift in [-0.15, -0.05, 0.05, 0.15]:
    predictions.append((x + shift) ** 2 + 0.02 * shift)
y_hats = np.vstack(predictions)
bias2, variance, noise = bias_variance(y_hats, y_true, noise_var=0.01)
print("Bias^2:", round(bias2, 6))
print("Variance:", round(variance, 6))
print("Noise:", round(noise, 6))
check_true(bias2 >= 0 and variance >= 0 and noise >= 0, "components are nonnegative")
print("Learning-theory lesson: expected error can be decomposed into interpretable terms.")

Demo 11: finite class bound

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 25

header("Demo 11 - finite class bound: Hoeffding generalization gap")
h_size = 64
m = 500
delta = 0.05
gap = hoeffding_gap(h_size, m, delta)
print("Generalization gap bound:", round(gap, 4))
check_true(0 < gap < 1, "gap bound is a probability-scale quantity")
print("Learning-theory lesson: more samples reduce the confidence penalty.")

Demo 12: linear predictors with norm constraints

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 27

header("Demo 12 - linear predictors with norm constraints: empirical Rademacher complexity")
x = np.linspace(-1, 1, 80)
rad = empirical_rademacher_linear(x, radius=2.0, trials=300)
print("Estimated empirical Rademacher complexity:", round(rad, 6))
check_true(rad >= 0, "Rademacher complexity is nonnegative")
print("Learning-theory lesson: fitting random signs is a data-dependent capacity test.")

Demo 13: effect of sample norm

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 29

header("Demo 13 - effect of sample norm: finite-class sample complexity")
h_size = 128
epsilon = 0.05
delta = 0.01
m = finite_class_bound(h_size, epsilon, delta)
print("Hypothesis count:", h_size)
print("Required samples:", int(np.ceil(m)))
check_true(m > 0, "sample complexity is positive")
print("Learning-theory lesson: logarithmic class-size dependence is powerful but not magic.")

Demo 14: Monte Carlo estimation

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 31

header("Demo 14 - Monte Carlo estimation: empirical and true risk")
y_true = np.array([1, 0, 1, 1, 0, 0, 1, 0])
y_pred = np.array([1, 0, 0, 1, 0, 1, 1, 0])
empirical_risk = np.mean(y_true != y_pred)
print("Empirical risk:", empirical_risk)
check_close(empirical_risk, 0.25, name="classification empirical risk")
print("Learning-theory lesson: risk is a loss expectation, estimated from finite samples.")

Demo 15: comparison to VC

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 33

header("Demo 15 - comparison to VC: VC growth comparison")
m_values = np.arange(1, 11)
d = 3
vc_growth = np.array([sum(math.comb(int(m), i) for i in range(min(d, int(m)) + 1)) for m in m_values])
all_labelings = 2 ** m_values
print("m values:", m_values.tolist())
print("VC upper growth:", vc_growth.tolist())
check_true(np.all(vc_growth <= all_labelings), "growth function is bounded by all labelings")
fig, ax = plt.subplots()
ax.plot(m_values, all_labelings, color=COLORS["secondary"], label="All dichotomies")
ax.plot(m_values, vc_growth, color=COLORS["primary"], label="VC polynomial bound")
ax.set_title("Exponential labelings versus VC growth")
ax.set_xlabel("Sample size $m$")
ax.set_ylabel("Dichotomy count")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Learning-theory lesson: finite VC dimension turns exponential growth into polynomial growth.")

Demo 16: symmetrization idea

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 35

header("Demo 16 - symmetrization idea: bias-variance components")
x = np.linspace(-1, 1, 40)
y_true = x ** 2
predictions = []
for shift in [-0.15, -0.05, 0.05, 0.15]:
    predictions.append((x + shift) ** 2 + 0.02 * shift)
y_hats = np.vstack(predictions)
bias2, variance, noise = bias_variance(y_hats, y_true, noise_var=0.01)
print("Bias^2:", round(bias2, 6))
print("Variance:", round(variance, 6))
print("Noise:", round(noise, 6))
check_true(bias2 >= 0 and variance >= 0 and noise >= 0, "components are nonnegative")
print("Learning-theory lesson: expected error can be decomposed into interpretable terms.")

Demo 17: contraction lemma preview

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 37

header("Demo 17 - contraction lemma preview: Hoeffding generalization gap")
h_size = 64
m = 500
delta = 0.05
gap = hoeffding_gap(h_size, m, delta)
print("Generalization gap bound:", round(gap, 4))
check_true(0 < gap < 1, "gap bound is a probability-scale quantity")
print("Learning-theory lesson: more samples reduce the confidence penalty.")

Demo 18: bounded-loss bound

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 39

header("Demo 18 - bounded-loss bound: empirical Rademacher complexity")
x = np.linspace(-1, 1, 80)
rad = empirical_rademacher_linear(x, radius=2.0, trials=300)
print("Estimated empirical Rademacher complexity:", round(rad, 6))
check_true(rad >= 0, "Rademacher complexity is nonnegative")
print("Learning-theory lesson: fitting random signs is a data-dependent capacity test.")

Demo 19: regularized ERM interpretation

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 41

header("Demo 19 - regularized ERM interpretation: finite-class sample complexity")
h_size = 128
epsilon = 0.05
delta = 0.01
m = finite_class_bound(h_size, epsilon, delta)
print("Hypothesis count:", h_size)
print("Required samples:", int(np.ceil(m)))
check_true(m > 0, "sample complexity is positive")
print("Learning-theory lesson: logarithmic class-size dependence is powerful but not magic.")

Demo 20: sample complexity

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 43

header("Demo 20 - sample complexity: empirical and true risk")
y_true = np.array([1, 0, 1, 1, 0, 0, 1, 0])
y_pred = np.array([1, 0, 0, 1, 0, 1, 1, 0])
empirical_risk = np.mean(y_true != y_pred)
print("Empirical risk:", empirical_risk)
check_close(empirical_risk, 0.25, name="classification empirical risk")
print("Learning-theory lesson: risk is a loss expectation, estimated from finite samples.")

Demo 21: norm-controlled predictors

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 45

header("Demo 21 - norm-controlled predictors: VC growth comparison")
m_values = np.arange(1, 11)
d = 3
vc_growth = np.array([sum(math.comb(int(m), i) for i in range(min(d, int(m)) + 1)) for m in m_values])
all_labelings = 2 ** m_values
print("m values:", m_values.tolist())
print("VC upper growth:", vc_growth.tolist())
check_true(np.all(vc_growth <= all_labelings), "growth function is bounded by all labelings")
fig, ax = plt.subplots()
ax.plot(m_values, all_labelings, color=COLORS["secondary"], label="All dichotomies")
ax.plot(m_values, vc_growth, color=COLORS["primary"], label="VC polynomial bound")
ax.set_title("Exponential labelings versus VC growth")
ax.set_xlabel("Sample size $m$")
ax.set_ylabel("Dichotomy count")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Learning-theory lesson: finite VC dimension turns exponential growth into polynomial growth.")

Demo 22: kernels and RKHS preview

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 47

header("Demo 22 - kernels and RKHS preview: bias-variance components")
x = np.linspace(-1, 1, 40)
y_true = x ** 2
predictions = []
for shift in [-0.15, -0.05, 0.05, 0.15]:
    predictions.append((x + shift) ** 2 + 0.02 * shift)
y_hats = np.vstack(predictions)
bias2, variance, noise = bias_variance(y_hats, y_true, noise_var=0.01)
print("Bias^2:", round(bias2, 6))
print("Variance:", round(variance, 6))
print("Noise:", round(noise, 6))
check_true(bias2 >= 0 and variance >= 0 and noise >= 0, "components are nonnegative")
print("Learning-theory lesson: expected error can be decomposed into interpretable terms.")

Demo 23: neural network norm bounds

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 49

header("Demo 23 - neural network norm bounds: Hoeffding generalization gap")
h_size = 64
m = 500
delta = 0.05
gap = hoeffding_gap(h_size, m, delta)
print("Generalization gap bound:", round(gap, 4))
check_true(0 < gap < 1, "gap bound is a probability-scale quantity")
print("Learning-theory lesson: more samples reduce the confidence penalty.")

Demo 24: adversarial robustness preview

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 51

header("Demo 24 - adversarial robustness preview: empirical Rademacher complexity")
x = np.linspace(-1, 1, 80)
rad = empirical_rademacher_linear(x, radius=2.0, trials=300)
print("Estimated empirical Rademacher complexity:", round(rad, 6))
check_true(rad >= 0, "Rademacher complexity is nonnegative")
print("Learning-theory lesson: fitting random signs is a data-dependent capacity test.")

Demo 25: representation learning caveats

This demo turns the approved TOC concept into a small executable learning-theory object.

Code cell 53

header("Demo 25 - representation learning caveats: finite-class sample complexity")
h_size = 128
epsilon = 0.05
delta = 0.01
m = finite_class_bound(h_size, epsilon, delta)
print("Hypothesis count:", h_size)
print("Required samples:", int(np.ceil(m)))
check_true(m > 0, "sample complexity is positive")
print("Learning-theory lesson: logarithmic class-size dependence is powerful but not magic.")

Previous lesson Next lesson