Converted from theory.ipynb for web reading.

Kernel Methods - Theory Notebook

This notebook is the executable companion to notes.md. It turns kernel definitions into concrete computations: Gram matrices, PSD checks, feature maps, RKHS-style prediction, kernel ridge regression, kernel PCA, Gaussian process posterior inference, scalable approximations, and NTK intuition.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

COLORS = {
    "primary":   "#0077BB",
    "secondary": "#EE7733",
    "tertiary":  "#009988",
    "error":     "#CC3311",
    "neutral":   "#555555",
    "highlight": "#EE3377",
}

def header(title):
    print("\n" + "=" * 78)
    print(title)
    print("=" * 78)

def check_true(name, condition):
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    if not ok:
        raise AssertionError(name)

def check_close(name, actual, expected, tol=1e-8):
    ok = np.allclose(actual, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}: max error={np.max(np.abs(np.asarray(actual)-np.asarray(expected))):.3e}")
    if not ok:
        raise AssertionError(name)

def linear_kernel(X, Z=None):
    Z = X if Z is None else Z
    return X @ Z.T

def polynomial_kernel(X, Z=None, degree=2, c=1.0):
    Z = X if Z is None else Z
    return (X @ Z.T + c) ** degree

def rbf_kernel(X, Z=None, gamma=1.0):
    Z = X if Z is None else Z
    X2 = np.sum(X * X, axis=1, keepdims=True)
    Z2 = np.sum(Z * Z, axis=1, keepdims=True).T
    D2 = np.maximum(X2 + Z2 - 2 * X @ Z.T, 0.0)
    return np.exp(-gamma * D2)

def laplacian_kernel(X, Z=None, gamma=1.0):
    Z = X if Z is None else Z
    D = np.sum(np.abs(X[:, None, :] - Z[None, :, :]), axis=2)
    return np.exp(-gamma * D)

def matern32_kernel(X, Z=None, ell=1.0):
    Z = X if Z is None else Z
    D = np.sqrt(np.maximum(np.sum((X[:, None, :] - Z[None, :, :])**2, axis=2), 0.0))
    a = np.sqrt(3.0) * D / ell
    return (1.0 + a) * np.exp(-a)

def normalize_kernel(K, eps=1e-12):
    d = np.sqrt(np.maximum(np.diag(K), eps))
    return K / (d[:, None] * d[None, :])

def center_kernel(K):
    n = K.shape[0]
    H = np.eye(n) - np.ones((n, n)) / n
    return H @ K @ H

def psd_eigenvalues(K):
    return np.linalg.eigvalsh((K + K.T) / 2)

def make_rings(n_per_class=80, noise=0.07):
    theta0 = np.random.uniform(0, 2*np.pi, n_per_class)
    theta1 = np.random.uniform(0, 2*np.pi, n_per_class)
    inner = np.column_stack([np.cos(theta0), np.sin(theta0)]) * 0.7
    outer = np.column_stack([np.cos(theta1), np.sin(theta1)]) * 1.6
    X = np.vstack([inner, outer]) + noise * np.random.randn(2*n_per_class, 2)
    y = np.r_[np.ones(n_per_class), -np.ones(n_per_class)]
    return X, y

def train_test_grid(xmin=-2.4, xmax=2.4, ymin=-2.4, ymax=2.4, steps=160):
    gx, gy = np.meshgrid(np.linspace(xmin, xmax, steps), np.linspace(ymin, ymax, steps))
    grid = np.column_stack([gx.ravel(), gy.ravel()])
    return gx, gy, grid

print("Kernel helper functions ready.")

1. Nonlinear data and the need for feature spaces

A kernel method often starts with data that is not linearly separable in the original coordinates. The rings dataset below is the canonical visual example.

Code cell 5

header("1.1 Generate nonlinear ring data")
X, y = make_rings(n_per_class=80, noise=0.07)
fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(X[y == 1, 0], X[y == 1, 1], color=COLORS["primary"], label="class +1", alpha=0.8)
ax.scatter(X[y == -1, 0], X[y == -1, 1], color=COLORS["secondary"], label="class -1", alpha=0.8)
ax.set_title("Concentric rings are not linearly separable")
ax.set_xlabel("$x_1$")
ax.set_ylabel("$x_2$")
ax.legend()
fig.tight_layout()
plt.show()
print(f"Dataset shape: {X.shape}, label balance: {np.bincount((y < 0).astype(int)).tolist()}")

2. Feature maps and polynomial kernels

The degree-2 homogeneous polynomial kernel has an explicit feature map in two dimensions. This cell verifies the identity numerically.
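Written out for $x, z \in \mathbb{R}^2$, the identity being checked is

$$(x^\top z)^2 = x_1^2 z_1^2 + 2\,x_1 x_2\, z_1 z_2 + x_2^2 z_2^2 = \langle \phi(x), \phi(z) \rangle, \qquad \phi(x) = \big(x_1^2,\ \sqrt{2}\,x_1 x_2,\ x_2^2\big).$$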

Code cell 7

header("2.1 Explicit feature map for a degree-2 polynomial kernel")
def phi_degree2_homogeneous(X):
    x1, x2 = X[:, 0], X[:, 1]
    return np.column_stack([x1**2, np.sqrt(2)*x1*x2, x2**2])

A = np.random.randn(5, 2)
B = np.random.randn(7, 2)
explicit = phi_degree2_homogeneous(A) @ phi_degree2_homogeneous(B).T
implicit = (A @ B.T) ** 2
check_close("explicit feature map equals polynomial kernel", explicit, implicit, tol=1e-10)
print("Feature dimension for 2D degree-2 homogeneous polynomial:", phi_degree2_homogeneous(A).shape[1])

3. Gram matrices

The Gram matrix stores pairwise kernel values over a finite dataset. For valid kernels, the Gram matrix is positive semidefinite for every finite dataset.
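In symbols: given points $x_1, \dots, x_n$, the Gram matrix has entries $K_{ij} = k(x_i, x_j)$, and validity of $k$ means $c^\top K c = \sum_{i,j} c_i c_j\, k(x_i, x_j) \ge 0$ for every $c \in \mathbb{R}^n$, which is what the eigenvalue checks below verify up to numerical tolerance.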

Code cell 9

header("3.1 Compare linear, polynomial, and RBF Gram matrices")
K_linear = linear_kernel(X)
K_poly = polynomial_kernel(X, degree=3, c=1.0)
K_rbf = rbf_kernel(X, gamma=1.5)
for name, K in [("linear", K_linear), ("polynomial", K_poly), ("RBF", K_rbf)]:
    eig = psd_eigenvalues(K)
    print(f"{name:10s}: shape={K.shape}, min eigenvalue={eig.min(): .3e}, max eigenvalue={eig.max(): .3e}")
    check_true(f"{name} Gram matrix is symmetric", np.allclose(K, K.T, atol=1e-10))
    check_true(f"{name} Gram matrix is PSD up to tolerance", eig.min() > -1e-8)

Code cell 10

header("3.2 Visualize Gram matrices")
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, K, title in zip(axes, [K_linear, K_poly, K_rbf], ["Linear", "Polynomial", "RBF"]):
    im = ax.imshow(K, cmap="viridis", aspect="auto")
    fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
    ax.set_title(f"{title} Gram matrix")
    ax.set_xlabel("Example index")
    ax.set_ylabel("Example index")
fig.tight_layout()
plt.show()
print("Displayed three Gram matrices.")

4. Counterexamples

Symmetry alone is not enough. A similarity can look reasonable and still fail the PSD condition.

Code cell 12

header("4.1 A symmetric similarity that is not PSD")
points = np.array([[-1.0], [0.0], [1.0]])
K_bad = -((points - points.T) ** 2)
eig_bad = psd_eigenvalues(K_bad)
print("Bad Gram matrix:\n", K_bad)
print("Eigenvalues:", eig_bad)
check_true("bad similarity has a negative eigenvalue", eig_bad.min() < -1e-8)

5. Kernel normalization

Kernel normalization converts feature-space inner products into feature-space cosine similarities.
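The normalized kernel used below is

$$\tilde{k}(x, z) = \frac{k(x, z)}{\sqrt{k(x, x)\,k(z, z)}} = \frac{\langle \phi(x), \phi(z) \rangle}{\lVert \phi(x) \rVert\, \lVert \phi(z) \rVert},$$

so its diagonal is identically one and its off-diagonal entries are cosines in feature space.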

Code cell 14

header("5.1 Normalize an RBF Gram matrix")
K_norm = normalize_kernel(K_rbf)
check_close("normalized diagonal is one", np.diag(K_norm), np.ones(K_norm.shape[0]), tol=1e-10)
print("First 5 normalized kernel values in row 0:", np.round(K_norm[0, :5], 4))

6. Kernel centering

Kernel PCA needs centered feature vectors. Centering is performed directly on the Gram matrix as $K_c = H K H$ with $H = I - \tfrac{1}{n}\mathbf{1}\mathbf{1}^\top$; because $H\mathbf{1} = 0$, the rows and columns of $K_c$ sum to zero.

Code cell 16

header("6.1 Center a kernel matrix")
Kc = center_kernel(K_rbf)
row_sum_norm = np.linalg.norm(Kc.sum(axis=1))
col_sum_norm = np.linalg.norm(Kc.sum(axis=0))
print(f"Row-sum norm after centering: {row_sum_norm:.3e}")
print(f"Column-sum norm after centering: {col_sum_norm:.3e}")
check_true("centered kernel row sums are near zero", row_sum_norm < 1e-8)
check_true("centered kernel column sums are near zero", col_sum_norm < 1e-8)

Code cell 17

header("6.2 Visualize centered versus uncentered kernel")
fig, axes = plt.subplots(1, 2, figsize=(11, 4))
for ax, K, title in zip(axes, [K_rbf, Kc], ["Uncentered RBF kernel", "Centered RBF kernel"]):
    im = ax.imshow(K, cmap="RdBu_r", aspect="auto")
    fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
    ax.set_title(title)
    ax.set_xlabel("Example index")
    ax.set_ylabel("Example index")
fig.tight_layout()
plt.show()
print("Displayed uncentered and centered kernels.")

7. Kernel bandwidth

For the RBF kernel, bandwidth controls locality. The eigenvalue profile changes dramatically as $\gamma$ changes.
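Here the RBF kernel is parameterized as $k(x, z) = \exp(-\gamma \lVert x - z \rVert^2)$: large $\gamma$ means a short effective lengthscale (the Gram matrix approaches the identity and the spectrum flattens), while small $\gamma$ gives a nearly constant kernel dominated by a few large eigenvalues.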

Code cell 19

header("7.1 RBF bandwidth and eigenvalue spectra")
gammas = [0.05, 0.3, 1.5, 8.0]
fig, ax = plt.subplots(figsize=(9, 5))
for gamma, color in zip(gammas, [COLORS["primary"], COLORS["secondary"], COLORS["tertiary"], COLORS["highlight"]]):
    K = rbf_kernel(X, gamma=gamma)
    eig = np.sort(psd_eigenvalues(K))[::-1]
    ax.plot(eig[:40], marker="o", markersize=3, label=f"$\\gamma={gamma}$", color=color)
ax.set_title("RBF bandwidth changes kernel eigenvalue decay")
ax.set_xlabel("Eigenvalue index")
ax.set_ylabel("Eigenvalue")
ax.legend()
fig.tight_layout()
plt.show()
print("Compared eigenvalue spectra for four RBF bandwidths.")

8. Common kernel families

Different kernels encode different smoothness and similarity assumptions.
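As a function of the distance $r = \lVert x - z \rVert$, the radial profiles plotted below (with $\ell = 1$ for the Matern kernels) are

$$k_{\mathrm{RBF}}(r) = e^{-\gamma r^2}, \qquad k_{\mathrm{Lap}}(r) = e^{-\gamma r}, \qquad k_{3/2}(r) = \Big(1 + \tfrac{\sqrt{3}\,r}{\ell}\Big) e^{-\sqrt{3}\,r/\ell}, \qquad k_{5/2}(r) = \Big(1 + \tfrac{\sqrt{5}\,r}{\ell} + \tfrac{5 r^2}{3 \ell^2}\Big) e^{-\sqrt{5}\,r/\ell}.$$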

Code cell 21

header("8.1 Compare radial kernels as a function of distance")
r = np.linspace(0, 4, 400)
rbf_vals = np.exp(-0.8 * r**2)
lap_vals = np.exp(-1.2 * r)
mat32_vals = (1 + np.sqrt(3)*r) * np.exp(-np.sqrt(3)*r)
mat52_vals = (1 + np.sqrt(5)*r + 5*r**2/3) * np.exp(-np.sqrt(5)*r)

fig, ax = plt.subplots(figsize=(9, 5))
ax.plot(r, rbf_vals, label="RBF", color=COLORS["primary"])
ax.plot(r, lap_vals, label="Laplacian", color=COLORS["secondary"])
ax.plot(r, mat32_vals, label="Matern 3/2", color=COLORS["tertiary"])
ax.plot(r, mat52_vals, label="Matern 5/2", color=COLORS["highlight"])
ax.set_title("Radial kernel profiles")
ax.set_xlabel("Distance $r$")
ax.set_ylabel("$k(r)$")
ax.legend()
fig.tight_layout()
plt.show()
print("Displayed radial kernel profiles.")

Code cell 22

header("8.2 Kernel composition preserves PSD")
K_sum = K_rbf + 0.25 * K_linear
K_prod = K_rbf * normalize_kernel(polynomial_kernel(X, degree=2, c=1.0))
for name, K in [("sum kernel", K_sum), ("product kernel", K_prod)]:
    eig = psd_eigenvalues(K)
    print(f"{name}: min eigenvalue={eig.min():.3e}")
    check_true(f"{name} is PSD up to tolerance", eig.min() > -1e-8)

9. Kernel perceptron

The kernel perceptron is the simplest fully kernelized classifier. It stores mistake counts rather than an explicit feature-space weight vector.
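The predictor and update implemented below are

$$f(x) = \sum_{i} \alpha_i\, y_i\, k(x_i, x), \qquad \text{on a mistake } y_i f(x_i) \le 0:\ \alpha_i \leftarrow \alpha_i + 1,$$

so $\alpha_i$ counts how many times example $i$ was misclassified during training.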

Code cell 24

header("9.1 Train a kernel perceptron")
def kernel_perceptron_fit(X, y, gamma=2.0, epochs=8):
    K = rbf_kernel(X, gamma=gamma)
    alpha = np.zeros(len(y))
    mistakes = []
    for epoch in range(epochs):
        count = 0
        for i in range(len(y)):
            score = np.sum(alpha * y * K[:, i])
            if y[i] * score <= 0:
                alpha[i] += 1.0
                count += 1
        mistakes.append(count)
    return alpha, mistakes

alpha_p, mistakes = kernel_perceptron_fit(X, y, gamma=2.0, epochs=10)
support_like = np.sum(alpha_p > 0)
train_scores = rbf_kernel(X, gamma=2.0) @ (alpha_p * y)
train_acc = np.mean(np.sign(train_scores) == y)
print("Mistakes by epoch:", mistakes)
print(f"Active training points: {support_like}")
print(f"Training accuracy: {train_acc:.3f}")
check_true("kernel perceptron fits most ring points", train_acc > 0.95)

Code cell 25

header("9.2 Visualize kernel perceptron decision boundary")
gx, gy, grid = train_test_grid()
scores_grid = rbf_kernel(grid, X, gamma=2.0) @ (alpha_p * y)
Z = scores_grid.reshape(gx.shape)
fig, ax = plt.subplots(figsize=(7, 7))
ax.contourf(gx, gy, Z, levels=30, cmap="RdBu_r", alpha=0.45)
ax.contour(gx, gy, Z, levels=[0], colors="black", linewidths=2)
ax.scatter(X[y == 1, 0], X[y == 1, 1], color=COLORS["primary"], label="class +1", edgecolor="white", s=45)
ax.scatter(X[y == -1, 0], X[y == -1, 1], color=COLORS["secondary"], label="class -1", edgecolor="white", s=45)
ax.set_title("Kernel perceptron learns a nonlinear boundary")
ax.set_xlabel("$x_1$")
ax.set_ylabel("$x_2$")
ax.legend()
fig.tight_layout()
plt.show()
print("Displayed kernel perceptron decision boundary.")

10. SVM dual structure

The SVM dual objective uses only labels and kernel values. This cell builds the quadratic form used by the dual.
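For reference, the dual problem whose quadratic form is assembled below is

$$\max_{\alpha}\ \sum_i \alpha_i - \tfrac{1}{2}\, \alpha^\top Q\, \alpha, \qquad Q_{ij} = y_i y_j\, k(x_i, x_j), \qquad 0 \le \alpha_i \le C, \quad \sum_i \alpha_i y_i = 0,$$

where $C$ is the soft-margin budget.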

Code cell 27

header("10.1 Build the SVM dual quadratic form")
Q = (y[:, None] * y[None, :]) * K_rbf
eig_Q = psd_eigenvalues(Q)
print(f"Q shape: {Q.shape}")
print(f"Q min eigenvalue: {eig_Q.min():.3e}")
check_true("SVM dual Q matrix is PSD up to tolerance", eig_Q.min() > -1e-8)
print("Dual objective: sum(alpha) - 0.5 * alpha.T @ Q @ alpha with box/equality constraints.")

11. Kernel ridge regression

Kernel ridge regression gives a closed-form nonlinear smoother.
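The closed form implemented in `krr_fit` and `krr_predict` below is

$$\alpha = (K + \lambda I)^{-1} y, \qquad \hat{f}(x_*) = \sum_i \alpha_i\, k(x_i, x_*).$$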

Code cell 29

header("11.1 Fit kernel ridge regression to noisy data")
def krr_fit(X_train, y_train, gamma=5.0, lam=1e-2):
    K = rbf_kernel(X_train, gamma=gamma)
    alpha = np.linalg.solve(K + lam * np.eye(len(X_train)), y_train)
    return alpha

def krr_predict(X_train, alpha, X_test, gamma=5.0):
    return rbf_kernel(X_test, X_train, gamma=gamma) @ alpha

x_train = np.linspace(-3, 3, 45)[:, None]
y_true = np.sin(2.2 * x_train[:, 0]) + 0.25 * x_train[:, 0]
y_train = y_true + 0.18 * np.random.randn(len(x_train))
x_test = np.linspace(-3.5, 3.5, 300)[:, None]
alpha_krr = krr_fit(x_train, y_train, gamma=3.0, lam=0.05)
y_pred = krr_predict(x_train, alpha_krr, x_test, gamma=3.0)
print(f"KRR coefficient norm: {np.linalg.norm(alpha_krr):.3f}")
check_true("KRR prediction has correct shape", y_pred.shape == (len(x_test),))

Code cell 30

header("11.2 Visualize KRR fit")
fig, ax = plt.subplots(figsize=(9, 5))
ax.scatter(x_train[:, 0], y_train, color=COLORS["neutral"], label="noisy training data", alpha=0.75)
ax.plot(x_test[:, 0], y_pred, color=COLORS["primary"], label="KRR prediction")
ax.plot(x_test[:, 0], np.sin(2.2*x_test[:, 0]) + 0.25*x_test[:, 0], color=COLORS["secondary"], linestyle="--", label="true function")
ax.set_title("Kernel ridge regression with an RBF kernel")
ax.set_xlabel("$x$")
ax.set_ylabel("$f(x)$")
ax.legend()
fig.tight_layout()
plt.show()
print("Displayed KRR regression curve.")

Code cell 31

header("11.3 Regularization path for KRR")
lams = [1e-4, 1e-2, 1e-1, 1.0]
fig, ax = plt.subplots(figsize=(9, 5))
ax.scatter(x_train[:, 0], y_train, color=COLORS["neutral"], alpha=0.35, label="data")
for lam, color in zip(lams, [COLORS["highlight"], COLORS["primary"], COLORS["secondary"], COLORS["tertiary"]]):
    a = krr_fit(x_train, y_train, gamma=3.0, lam=lam)
    p = krr_predict(x_train, a, x_test, gamma=3.0)
    ax.plot(x_test[:, 0], p, color=color, label=f"$\\lambda={lam}$")
ax.set_title("Regularization changes KRR smoothness")
ax.set_xlabel("$x$")
ax.set_ylabel("prediction")
ax.legend()
fig.tight_layout()
plt.show()
print("Displayed KRR regularization path.")

12. Kernel PCA

Kernel PCA performs PCA in feature space through the centered Gram matrix.
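Concretely, with the centered Gram matrix $K_c$, kernel PCA solves the eigenproblem $K_c v_j = \lambda_j v_j$ and embeds example $i$ along component $j$ as $\sqrt{\lambda_j}\, v_{ij}$, which is exactly what the code below computes.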

Code cell 33

header("12.1 Compute kernel PCA embedding")
def kernel_pca_embedding(X, gamma=1.0, n_components=2):
    K = center_kernel(rbf_kernel(X, gamma=gamma))
    eigvals, eigvecs = np.linalg.eigh((K + K.T) / 2)
    order = np.argsort(eigvals)[::-1]
    eigvals = eigvals[order]
    eigvecs = eigvecs[:, order]
    positive = np.maximum(eigvals[:n_components], 0.0)
    embedding = eigvecs[:, :n_components] * np.sqrt(positive)
    return embedding, eigvals

emb, eigvals_kpca = kernel_pca_embedding(X, gamma=1.5, n_components=2)
print("Top 5 centered-kernel eigenvalues:", np.round(eigvals_kpca[:5], 4))
check_true("kernel PCA embedding has two coordinates", emb.shape == (len(X), 2))

Code cell 34

header("12.2 Visualize kernel PCA embedding")
fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(emb[y == 1, 0], emb[y == 1, 1], color=COLORS["primary"], label="class +1", alpha=0.8)
ax.scatter(emb[y == -1, 0], emb[y == -1, 1], color=COLORS["secondary"], label="class -1", alpha=0.8)
ax.set_title("Kernel PCA embedding separates nonlinear rings")
ax.set_xlabel("Kernel PC 1")
ax.set_ylabel("Kernel PC 2")
ax.legend()
fig.tight_layout()
plt.show()
print("Displayed kernel PCA embedding.")

13. Gaussian process regression

A Gaussian process uses the kernel as a covariance function.
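With training inputs $X$, noisy targets $y$, and noise level $\sigma$, the posterior at a test point $x_*$ used below is

$$\mu_* = k(x_*, X)\,(K + \sigma^2 I)^{-1} y, \qquad \sigma_*^2 = k(x_*, x_*) - k(x_*, X)\,(K + \sigma^2 I)^{-1}\, k(X, x_*),$$

computed stably through a Cholesky factorization of $K + \sigma^2 I$.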

Code cell 36

header("13.1 Compute GP posterior mean and variance")
def gp_posterior(X_train, y_train, X_test, gamma=2.0, noise=0.08, jitter=1e-8):
    K = rbf_kernel(X_train, gamma=gamma) + (noise**2 + jitter) * np.eye(len(X_train))
    Ks = rbf_kernel(X_train, X_test, gamma=gamma)
    Kss_diag = np.ones(len(X_test))
    L = np.linalg.cholesky(K)
    alpha = np.linalg.solve(L.T, np.linalg.solve(L, y_train))
    mean = Ks.T @ alpha
    v = np.linalg.solve(L, Ks)
    var = np.maximum(Kss_diag - np.sum(v*v, axis=0), 0.0)
    return mean, var

idx = np.linspace(0, len(x_train)-1, 18).astype(int)
X_gp = x_train[idx]
y_gp = y_train[idx]
gp_mean, gp_var = gp_posterior(X_gp, y_gp, x_test, gamma=2.5, noise=0.12)
print(f"GP posterior mean shape: {gp_mean.shape}")
print(f"GP posterior variance min/max: {gp_var.min():.4f}/{gp_var.max():.4f}")
check_true("GP variances are nonnegative", np.all(gp_var >= -1e-10))

Code cell 37

header("13.2 Visualize GP posterior")
std = np.sqrt(gp_var)
fig, ax = plt.subplots(figsize=(9, 5))
ax.scatter(X_gp[:, 0], y_gp, color=COLORS["neutral"], label="observed data", zorder=3)
ax.plot(x_test[:, 0], gp_mean, color=COLORS["primary"], label="posterior mean")
ax.fill_between(x_test[:, 0], gp_mean - 2*std, gp_mean + 2*std, color=COLORS["primary"], alpha=0.2, label="95% band")
ax.set_title("Gaussian process posterior uncertainty")
ax.set_xlabel("$x$")
ax.set_ylabel("$f(x)$")
ax.legend()
fig.tight_layout()
plt.show()
print("Displayed GP posterior mean and uncertainty band.")

14. Kernel matrix cost

Exact kernels are limited by the $n \times n$ Gram matrix.

Code cell 39

header("14.1 Estimate kernel memory cost")
ns = np.array([1_000, 5_000, 10_000, 50_000, 100_000])
gb_float64 = ns**2 * 8 / 1e9
for n, gb in zip(ns, gb_float64):
    print(f"n={n:6d}: float64 Gram matrix approx {gb:8.2f} GB")
check_true("memory grows quadratically", gb_float64[-1] / gb_float64[-2] > 3.5)

15. Conditioning

Ridge regularization shifts Gram-matrix eigenvalues and improves numerical stability.
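For a PSD Gram matrix, adding $\lambda I$ shifts every eigenvalue by $\lambda$, so the condition number improves from $\lambda_{\max} / \lambda_{\min}$ to $(\lambda_{\max} + \lambda) / (\lambda_{\min} + \lambda)$; the cell below reports this shifted ratio for several $\lambda$.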

Code cell 41

header("15.1 Ridge shift improves conditioning")
K_ill = rbf_kernel(x_train, gamma=0.02)
eig = psd_eigenvalues(K_ill)
for lam in [0.0, 1e-6, 1e-3, 1e-1]:
    shifted = eig + lam
    cond = shifted.max() / max(shifted.min(), 1e-15)
    print(f"lambda={lam:8g}: min eig={shifted.min():.3e}, condition approx={cond:.3e}")
check_true("ridge shift raises the smallest eigenvalue", (eig + 1e-3).min() > eig.min())

16. Nystrom approximation

Nystrom methods approximate the full Gram matrix using landmark points.
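With $m$ landmark points $X_m$, the approximation computed below is

$$K \approx C\, W^{+} C^\top, \qquad C = k(X, X_m) \in \mathbb{R}^{n \times m}, \qquad W = k(X_m, X_m) \in \mathbb{R}^{m \times m},$$

which requires $O(nm)$ kernel evaluations and an $m \times m$ pseudoinverse instead of the full $n \times n$ Gram matrix.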

Code cell 43

header("16.1 Nystrom approximation of an RBF Gram matrix")
def nystrom_approximation(X, m=25, gamma=1.5, jitter=1e-6):
    idx = np.random.choice(len(X), size=m, replace=False)
    landmarks = X[idx]
    C = rbf_kernel(X, landmarks, gamma=gamma)
    W = rbf_kernel(landmarks, gamma=gamma)
    W_inv = np.linalg.pinv(W + jitter * np.eye(m))
    return C @ W_inv @ C.T, idx

K_full = rbf_kernel(X, gamma=1.5)
errors = []
ms = [5, 10, 20, 40, 80]
for m in ms:
    K_approx, _ = nystrom_approximation(X, m=m, gamma=1.5)
    rel = np.linalg.norm(K_full - K_approx, "fro") / np.linalg.norm(K_full, "fro")
    errors.append(rel)
    print(f"m={m:3d}: relative Frobenius error={rel:.4f}")
check_true("larger landmark budget improves approximation overall", errors[-1] < errors[0])

Code cell 44

header("16.2 Plot Nystrom error curve")
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(ms, errors, marker="o", color=COLORS["primary"])
ax.set_title("Nystrom approximation improves with more landmarks")
ax.set_xlabel("Number of landmarks $m$")
ax.set_ylabel("Relative Frobenius error")
fig.tight_layout()
plt.show()
print("Displayed Nystrom approximation error curve.")

17. Random Fourier features

Random Fourier features approximate shift-invariant kernels with explicit random features.
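For the RBF kernel $k(x, z) = \exp(-\gamma \lVert x - z \rVert^2)$, Bochner's theorem gives

$$k(x, z) = \mathbb{E}_{\omega, b}\big[\, 2 \cos(\omega^\top x + b)\, \cos(\omega^\top z + b)\,\big], \qquad \omega \sim \mathcal{N}(0,\, 2\gamma I), \quad b \sim \mathrm{Unif}[0, 2\pi],$$

so averaging $D$ independent draws yields the feature map $z(x) = \sqrt{2/D}\, \cos(\Omega x + b)$ used below, with $k(x, z) \approx z(x)^\top z(z)$.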

Code cell 46

header("17.1 Random Fourier feature map")
def rff_features(X, D=200, gamma=1.0, omega=None, b=None):
    d = X.shape[1]
    if omega is None:
        omega = np.random.randn(D, d) * np.sqrt(2 * gamma)
    if b is None:
        b = np.random.uniform(0, 2*np.pi, size=D)
    Z = np.sqrt(2.0 / D) * np.cos(X @ omega.T + b)
    return Z, omega, b

Ds = [25, 50, 100, 250, 500]
rff_errors = []
K_target = rbf_kernel(X, gamma=1.0)
for D in Ds:
    Z, _, _ = rff_features(X, D=D, gamma=1.0)
    K_hat = Z @ Z.T
    rel = np.linalg.norm(K_target - K_hat, "fro") / np.linalg.norm(K_target, "fro")
    rff_errors.append(rel)
    print(f"D={D:4d}: relative kernel approximation error={rel:.4f}")
check_true("RFF approximation gets reasonable by D=500", rff_errors[-1] < 0.45)

Code cell 47

header("17.2 Plot RFF approximation error")
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(Ds, rff_errors, marker="o", color=COLORS["secondary"])
ax.set_title("Random Fourier feature approximation")
ax.set_xlabel("Number of random features $D$")
ax.set_ylabel("Relative Frobenius error")
fig.tight_layout()
plt.show()
print("Displayed RFF approximation error curve.")

18. Random features turn kernels into linear models

Once random features are explicit, ridge regression can be done with a linear design matrix.
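With the random feature matrix $Z \in \mathbb{R}^{n \times D}$, the ridge solution below is the ordinary linear one, $w = (Z^\top Z + \lambda I)^{-1} Z^\top y$, which costs $O(n D^2 + D^3)$ rather than the $O(n^3)$ solve needed for exact KRR.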

Code cell 49

header("18.1 Ridge regression on random Fourier features")
Z_train, omega, b = rff_features(x_train, D=500, gamma=3.0)
Z_test, _, _ = rff_features(x_test, D=500, gamma=3.0, omega=omega, b=b)
lam = 0.05
w_rff = np.linalg.solve(Z_train.T @ Z_train + lam * np.eye(Z_train.shape[1]), Z_train.T @ y_train)
y_rff = Z_test @ w_rff
y_exact = krr_predict(x_train, krr_fit(x_train, y_train, gamma=3.0, lam=lam), x_test, gamma=3.0)
rmse_between = np.sqrt(np.mean((y_rff - y_exact)**2))
print(f"RMSE between exact KRR and RFF ridge predictions: {rmse_between:.4f}")
check_true("RFF ridge tracks exact KRR reasonably", rmse_between < 0.35)

Code cell 50

header("18.2 Visualize exact KRR versus RFF ridge")
fig, ax = plt.subplots(figsize=(9, 5))
ax.scatter(x_train[:, 0], y_train, color=COLORS["neutral"], alpha=0.35, label="data")
ax.plot(x_test[:, 0], y_exact, color=COLORS["primary"], label="exact KRR")
ax.plot(x_test[:, 0], y_rff, color=COLORS["secondary"], linestyle="--", label="RFF ridge")
ax.set_title("Random features approximate kernel ridge regression")
ax.set_xlabel("$x$")
ax.set_ylabel("prediction")
ax.legend()
fig.tight_layout()
plt.show()
print("Displayed exact KRR and RFF ridge comparison.")

19. Similarity, embeddings, and cosine kernels

Cosine similarity is a valid kernel: it is the linear kernel applied to unit-normalized vectors.

Code cell 52

header("19.1 Cosine kernel over synthetic embeddings")
E = np.random.randn(12, 5)
E = E / np.linalg.norm(E, axis=1, keepdims=True)
K_cos = E @ E.T
eig_cos = psd_eigenvalues(K_cos)
print("Cosine Gram diagonal:", np.round(np.diag(K_cos), 3))
print("Minimum eigenvalue:", eig_cos.min())
check_true("cosine Gram over normalized embeddings is PSD", eig_cos.min() > -1e-10)

20. Attention and kernel feature maps

Some efficient attention mechanisms approximate exponential dot-product kernels with feature maps.
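The identity behind such feature maps, stated here as background (the cell below only checks PSD-ness and row normalization), is that for $\omega \sim \mathcal{N}(0, I)$,

$$\exp(q^\top k) = \mathbb{E}_{\omega}\Big[\, e^{\omega^\top q - \lVert q \rVert^2 / 2}\; e^{\omega^\top k - \lVert k \rVert^2 / 2}\,\Big],$$

so the exponential dot-product kernel is an expectation of products of positive random features and can be estimated with a finite sample of $\omega$'s.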

Code cell 54

header("20.1 Softmax-style exponential kernel is positive on pairs but not automatically normalized")
Q_small = np.random.randn(6, 4)
K_exp = np.exp(Q_small @ Q_small.T / np.sqrt(Q_small.shape[1]))
eig_exp = psd_eigenvalues(K_exp)
print("Exponential dot-product Gram min eigenvalue:", eig_exp.min())
check_true("exp dot-product Gram is PSD on this finite set", eig_exp.min() > -1e-8)
row_softmax = K_exp / K_exp.sum(axis=1, keepdims=True)
print("Row-stochastic attention row sums:", np.round(row_softmax.sum(axis=1), 6))
check_true("row-normalized attention is not symmetric in general", not np.allclose(row_softmax, row_softmax.T))

21. Neural tangent kernel

The empirical NTK is a Gram matrix of parameter gradients. This small finite-difference example avoids autodiff dependencies.
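For a scalar-output network $f_\theta$, the empirical NTK on a finite input set is

$$\Theta_{ij} = \nabla_\theta f_\theta(x_i)^\top\, \nabla_\theta f_\theta(x_j),$$

i.e. the Gram matrix $J J^\top$ of the parameter Jacobian, which is exactly what the finite-difference code below assembles.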

Code cell 56

header("21.1 Empirical NTK by finite differences")
def unpack(theta, width=8):
    idx = 0
    W1 = theta[idx:idx+width].reshape(width, 1); idx += width
    b1 = theta[idx:idx+width]; idx += width
    W2 = theta[idx:idx+width].reshape(1, width); idx += width
    b2 = theta[idx]
    return W1, b1, W2, b2

def tiny_net(theta, X1, width=8):
    W1, b1, W2, b2 = unpack(theta, width)
    H = np.tanh(X1 @ W1.T + b1)
    return (H @ W2.T).ravel() + b2

def finite_difference_jacobian(theta, X1, eps=1e-5):
    base = tiny_net(theta, X1)
    J = np.zeros((len(X1), len(theta)))
    for p in range(len(theta)):
        step = np.zeros_like(theta)
        step[p] = eps
        J[:, p] = (tiny_net(theta + step, X1) - tiny_net(theta - step, X1)) / (2 * eps)
    return J

X_ntk = np.linspace(-2, 2, 20)[:, None]
width = 8
theta = 0.5 * np.random.randn(width + width + width + 1)
J = finite_difference_jacobian(theta, X_ntk)
Theta = J @ J.T
eig_ntk = psd_eigenvalues(Theta)
print(f"Jacobian shape: {J.shape}")
print(f"NTK Gram min eigenvalue: {eig_ntk.min():.3e}")
check_true("empirical NTK is PSD up to numerical tolerance", eig_ntk.min() > -1e-8)

Code cell 57

header("21.2 Visualize empirical NTK")
fig, ax = plt.subplots(figsize=(7, 6))
im = ax.imshow(Theta, cmap="viridis", aspect="auto")
fig.colorbar(im, ax=ax, label="NTK value")
ax.set_title("Empirical NTK Gram matrix")
ax.set_xlabel("Input index")
ax.set_ylabel("Input index")
fig.tight_layout()
plt.show()
print("Displayed empirical NTK Gram matrix.")

22. End-to-end summary

Kernel methods reduce function-space learning to finite kernel computations. This notebook has traced that arc from definitions through scalable approximations to connections with deep-learning theory.

Code cell 59

header("22.1 Final notebook checks")
checks = {
    "RBF Gram PSD": psd_eigenvalues(K_rbf).min() > -1e-8,
    "KRR prediction finite": np.all(np.isfinite(y_pred)),
    "Kernel PCA finite": np.all(np.isfinite(emb)),
    "GP variance finite": np.all(np.isfinite(gp_var)),
    "RFF error finite": np.all(np.isfinite(rff_errors)),
    "NTK PSD": psd_eigenvalues(Theta).min() > -1e-8,
}
for name, ok in checks.items():
    check_true(name, ok)
print("Kernel methods theory notebook completed successfully.")