Theory Notebook
Converted from theory.ipynb for web reading.
Kernel Methods - Theory Notebook
This notebook is the executable companion to notes.md.
It turns kernel definitions into concrete computations: Gram matrices, PSD checks, feature maps, RKHS-style prediction, kernel ridge regression, kernel PCA, Gaussian process posterior inference, scalable approximations, and NTK intuition.
Code cell 2
# Numerical and plotting setup shared by every cell in the notebook.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Prefer seaborn's theme when the package is installed; otherwise fall back
# to matplotlib's bundled seaborn-like stylesheet.
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

# Uniform figure aesthetics for all plots below.
mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})

np.random.seed(42)  # reproducible randomness across all cells
print("Plot setup complete.")
Code cell 3
# Colorblind-friendly palette used by every figure in the notebook.
COLORS = {
    "primary": "#0077BB",    # main series / class +1
    "secondary": "#EE7733",  # contrast series / class -1
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",    # raw data points
    "highlight": "#EE3377",
}
def header(title):
    """Print a section banner: a 78-character rule, the title, another rule."""
    rule = "=" * 78
    print("\n" + rule)
    print(title)
    print(rule)
def check_true(name, condition):
    """Print a PASS/FAIL line for a boolean check.

    Raises:
        AssertionError: with *name* as the message when *condition* is falsy.
    """
    passed = bool(condition)
    print(f"{'PASS' if passed else 'FAIL'} - {name}")
    if not passed:
        raise AssertionError(name)
def check_close(name, actual, expected, tol=1e-8):
    """Print a PASS/FAIL line for a numerical closeness check.

    Uses *tol* for both absolute and relative tolerance and raises
    AssertionError with *name* when the arrays disagree.
    """
    a = np.asarray(actual)
    e = np.asarray(expected)
    ok = np.allclose(a, e, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}: max error={np.max(np.abs(a - e)):.3e}")
    if not ok:
        raise AssertionError(name)
def linear_kernel(X, Z=None):
    """Linear kernel: K[i, j] = <X[i], Z[j]>; Z defaults to X (Gram matrix)."""
    if Z is None:
        Z = X
    return X @ Z.T
def polynomial_kernel(X, Z=None, degree=2, c=1.0):
    """Inhomogeneous polynomial kernel: (x . z + c) ** degree."""
    if Z is None:
        Z = X
    return (X @ Z.T + c) ** degree
def rbf_kernel(X, Z=None, gamma=1.0):
    """RBF (Gaussian) kernel exp(-gamma * ||x - z||^2).

    Squared distances are built from the expansion ||x||^2 + ||z||^2 - 2 x.z
    and clamped at zero to absorb tiny negative round-off.
    """
    if Z is None:
        Z = X
    sq_x = np.sum(X * X, axis=1, keepdims=True)
    sq_z = np.sum(Z * Z, axis=1, keepdims=True).T
    sq_dist = np.maximum(sq_x + sq_z - 2 * X @ Z.T, 0.0)
    return np.exp(-gamma * sq_dist)
def laplacian_kernel(X, Z=None, gamma=1.0):
    """Laplacian kernel exp(-gamma * ||x - z||_1), via pairwise broadcasting."""
    if Z is None:
        Z = X
    l1_dist = np.abs(X[:, None, :] - Z[None, :, :]).sum(axis=2)
    return np.exp(-gamma * l1_dist)
def matern32_kernel(X, Z=None, ell=1.0):
    """Matern 3/2 kernel: (1 + sqrt(3) d/ell) * exp(-sqrt(3) d/ell)."""
    if Z is None:
        Z = X
    sq = np.sum((X[:, None, :] - Z[None, :, :]) ** 2, axis=2)
    dist = np.sqrt(np.maximum(sq, 0.0))  # guard against round-off negatives
    scaled = np.sqrt(3.0) * dist / ell
    return (1.0 + scaled) * np.exp(-scaled)
def normalize_kernel(K, eps=1e-12):
    """Cosine-normalize a Gram matrix: K[i,j] / sqrt(K[i,i] * K[j,j]).

    The diagonal is floored at *eps* to avoid division by zero.
    """
    scale = np.sqrt(np.maximum(np.diag(K), eps))
    return K / np.outer(scale, scale)
def center_kernel(K):
    """Double-center a Gram matrix: H @ K @ H with H = I - (1/n) 1 1^T."""
    n = K.shape[0]
    centering = np.eye(n) - np.full((n, n), 1.0 / n)
    return centering @ K @ centering
def psd_eigenvalues(K):
    """Eigenvalues of the symmetrized matrix, ascending (eigvalsh order)."""
    sym = 0.5 * (K + K.T)
    return np.linalg.eigvalsh(sym)
def make_rings(n_per_class=80, noise=0.07):
    """Sample two concentric noisy rings (radii 0.7 and 1.6).

    Returns (X, y) where X is (2*n_per_class, 2) and y is +1 for the inner
    ring, -1 for the outer ring. Uses the global NumPy random state.
    """
    angles_inner = np.random.uniform(0, 2 * np.pi, n_per_class)
    angles_outer = np.random.uniform(0, 2 * np.pi, n_per_class)
    inner = 0.7 * np.column_stack([np.cos(angles_inner), np.sin(angles_inner)])
    outer = 1.6 * np.column_stack([np.cos(angles_outer), np.sin(angles_outer)])
    X = np.vstack([inner, outer]) + noise * np.random.randn(2 * n_per_class, 2)
    y = np.concatenate([np.ones(n_per_class), -np.ones(n_per_class)])
    return X, y
def train_test_grid(xmin=-2.4, xmax=2.4, ymin=-2.4, ymax=2.4, steps=160):
    """Dense 2D evaluation grid for decision-boundary plots.

    Returns the two meshgrid coordinate arrays plus the flattened
    (steps*steps, 2) array of grid points.
    """
    xs = np.linspace(xmin, xmax, steps)
    ys = np.linspace(ymin, ymax, steps)
    gx, gy = np.meshgrid(xs, ys)
    grid = np.column_stack([gx.ravel(), gy.ravel()])
    return gx, gy, grid

print("Kernel helper functions ready.")
1. Nonlinear data and the need for feature spaces
A kernel method often starts with data that is not linearly separable in the original coordinates. The rings dataset below is the canonical visual example.
Code cell 5
header("1.1 Generate nonlinear ring data")

X, y = make_rings(n_per_class=80, noise=0.07)

# Scatter the two classes; no straight line separates the rings.
fig, ax = plt.subplots(figsize=(7, 7))
for label, color, legend in [(1, COLORS["primary"], "class +1"), (-1, COLORS["secondary"], "class -1")]:
    mask = y == label
    ax.scatter(X[mask, 0], X[mask, 1], color=color, label=legend, alpha=0.8)
ax.set_title("Concentric rings are not linearly separable")
ax.set_xlabel("$x_1$")
ax.set_ylabel("$x_2$")
ax.legend()
fig.tight_layout()
plt.show()
print(f"Dataset shape: {X.shape}, label balance: {np.bincount((y < 0).astype(int)).tolist()}")
2. Feature maps and polynomial kernels
The degree-2 homogeneous polynomial kernel has an explicit feature map in two dimensions. This cell verifies the identity numerically.
Code cell 7
header("2.1 Explicit feature map for a degree-2 polynomial kernel")

def phi_degree2_homogeneous(X):
    """Explicit feature map for k(x, z) = (x . z)^2 on 2D inputs."""
    x1, x2 = X[:, 0], X[:, 1]
    return np.column_stack([x1**2, np.sqrt(2) * x1 * x2, x2**2])

A = np.random.randn(5, 2)
B = np.random.randn(7, 2)
# Inner products of explicit features must equal the implicit kernel values.
explicit = phi_degree2_homogeneous(A) @ phi_degree2_homogeneous(B).T
implicit = (A @ B.T) ** 2
check_close("explicit feature map equals polynomial kernel", explicit, implicit, tol=1e-10)
print("Feature dimension for 2D degree-2 homogeneous polynomial:", phi_degree2_homogeneous(A).shape[1])
3. Gram matrices
The Gram matrix stores pairwise kernel values over a finite dataset. For valid kernels, the Gram matrix is positive semidefinite for every finite dataset.
Code cell 9
header("3.1 Compare linear, polynomial, and RBF Gram matrices")

K_linear = linear_kernel(X)
K_poly = polynomial_kernel(X, degree=3, c=1.0)
K_rbf = rbf_kernel(X, gamma=1.5)

# Every valid kernel must produce a symmetric PSD Gram matrix.
for name, K in [("linear", K_linear), ("polynomial", K_poly), ("RBF", K_rbf)]:
    eig = psd_eigenvalues(K)
    print(f"{name:10s}: shape={K.shape}, min eigenvalue={eig.min(): .3e}, max eigenvalue={eig.max(): .3e}")
    check_true(f"{name} Gram matrix is symmetric", np.allclose(K, K.T, atol=1e-10))
    check_true(f"{name} Gram matrix is PSD up to tolerance", eig.min() > -1e-8)
Code cell 10
header("3.2 Visualize Gram matrices")

# One heatmap per kernel, shared layout.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, gram, label in zip(axes, [K_linear, K_poly, K_rbf], ["Linear", "Polynomial", "RBF"]):
    img = ax.imshow(gram, cmap="viridis", aspect="auto")
    fig.colorbar(img, ax=ax, fraction=0.046, pad=0.04)
    ax.set_title(f"{label} Gram matrix")
    ax.set_xlabel("Example index")
    ax.set_ylabel("Example index")
fig.tight_layout()
plt.show()
print("Displayed three Gram matrices.")
4. Counterexamples
Symmetry alone is not enough. A similarity can look reasonable and still fail the PSD condition.
Code cell 12
header("4.1 A symmetric similarity that is not PSD")

# Negative squared distance is symmetric yet has negative eigenvalues.
points = np.array([[-1.0], [0.0], [1.0]])
K_bad = -((points - points.T) ** 2)
eig_bad = psd_eigenvalues(K_bad)
print("Bad Gram matrix:\n", K_bad)
print("Eigenvalues:", eig_bad)
check_true("bad similarity has a negative eigenvalue", eig_bad.min() < -1e-8)
5. Kernel normalization
Kernel normalization converts feature-space inner products into feature-space cosine similarities.
Code cell 14
header("5.1 Normalize an RBF Gram matrix")

# Normalization gives every point unit self-similarity.
K_norm = normalize_kernel(K_rbf)
check_close("normalized diagonal is one", np.diag(K_norm), np.ones(K_norm.shape[0]), tol=1e-10)
print("First 5 normalized kernel values in row 0:", np.round(K_norm[0, :5], 4))
6. Kernel centering
Kernel PCA needs centered feature vectors. Centering is performed directly on the Gram matrix with the centering matrix $H = I - \frac{1}{n}\mathbf{1}\mathbf{1}^\top$, i.e. $K_c = H K H$.
Code cell 16
header("6.1 Center a kernel matrix")

Kc = center_kernel(K_rbf)
# After double-centering, feature vectors have zero mean, so all row and
# column sums of the Gram matrix vanish.
row_sum_norm = np.linalg.norm(Kc.sum(axis=1))
col_sum_norm = np.linalg.norm(Kc.sum(axis=0))
print(f"Row-sum norm after centering: {row_sum_norm:.3e}")
print(f"Column-sum norm after centering: {col_sum_norm:.3e}")
check_true("centered kernel row sums are near zero", row_sum_norm < 1e-8)
check_true("centered kernel column sums are near zero", col_sum_norm < 1e-8)
Code cell 17
header("6.2 Visualize centered versus uncentered kernel")

# Side-by-side heatmaps of the raw and double-centered RBF Gram matrices.
fig, axes = plt.subplots(1, 2, figsize=(11, 4))
for ax, gram, title in zip(axes, [K_rbf, Kc], ["Uncentered RBF kernel", "Centered RBF kernel"]):
    img = ax.imshow(gram, cmap="RdBu_r", aspect="auto")
    fig.colorbar(img, ax=ax, fraction=0.046, pad=0.04)
    ax.set_title(title)
    ax.set_xlabel("Example index")
    ax.set_ylabel("Example index")
fig.tight_layout()
plt.show()
print("Displayed uncentered and centered kernels.")
7. Kernel bandwidth
For the RBF kernel, bandwidth controls locality. The eigenvalue profile changes dramatically as $\gamma$ changes.
Code cell 19
header("7.1 RBF bandwidth and eigenvalue spectra")

gammas = [0.05, 0.3, 1.5, 8.0]
palette = [COLORS["primary"], COLORS["secondary"], COLORS["tertiary"], COLORS["highlight"]]
fig, ax = plt.subplots(figsize=(9, 5))
for gamma, color in zip(gammas, palette):
    # Sort eigenvalues descending and plot the top of the spectrum.
    spectrum = np.sort(psd_eigenvalues(rbf_kernel(X, gamma=gamma)))[::-1]
    ax.plot(spectrum[:40], marker="o", markersize=3, label=f"$\\gamma={gamma}$", color=color)
ax.set_title("RBF bandwidth changes kernel eigenvalue decay")
ax.set_xlabel("Eigenvalue index")
ax.set_ylabel("Eigenvalue")
ax.legend()
fig.tight_layout()
plt.show()
print("Compared eigenvalue spectra for four RBF bandwidths.")
8. Common kernel families
Different kernels encode different smoothness and similarity assumptions.
Code cell 21
header("8.1 Compare radial kernels as a function of distance")

r = np.linspace(0, 4, 400)
# One-dimensional profiles k(r) for four shift-invariant kernels.
profiles = [
    ("RBF", np.exp(-0.8 * r**2), COLORS["primary"]),
    ("Laplacian", np.exp(-1.2 * r), COLORS["secondary"]),
    ("Matern 3/2", (1 + np.sqrt(3)*r) * np.exp(-np.sqrt(3)*r), COLORS["tertiary"]),
    ("Matern 5/2", (1 + np.sqrt(5)*r + 5*r**2/3) * np.exp(-np.sqrt(5)*r), COLORS["highlight"]),
]
fig, ax = plt.subplots(figsize=(9, 5))
for label, vals, color in profiles:
    ax.plot(r, vals, label=label, color=color)
ax.set_title("Radial kernel profiles")
ax.set_xlabel("Distance $r$")
ax.set_ylabel("$k(r)$")
ax.legend()
fig.tight_layout()
plt.show()
print("Displayed radial kernel profiles.")
Code cell 22
header("8.2 Kernel composition preserves PSD")

# Sums and elementwise (Schur) products of PSD kernels stay PSD.
K_sum = K_rbf + 0.25 * K_linear
K_prod = K_rbf * normalize_kernel(polynomial_kernel(X, degree=2, c=1.0))
for name, K in [("sum kernel", K_sum), ("product kernel", K_prod)]:
    eig = psd_eigenvalues(K)
    print(f"{name}: min eigenvalue={eig.min():.3e}")
    check_true(f"{name} is PSD up to tolerance", eig.min() > -1e-8)
9. Kernel perceptron
The kernel perceptron is the simplest fully kernelized classifier. It stores mistake counts rather than an explicit feature-space weight vector.
Code cell 24
header("9.1 Train a kernel perceptron")

def kernel_perceptron_fit(X, y, gamma=2.0, epochs=8):
    """Kernelized perceptron with an RBF kernel.

    Returns the per-example mistake counts (dual coefficients) and the
    number of mistakes made in each epoch.
    """
    K = rbf_kernel(X, gamma=gamma)
    alpha = np.zeros(len(y))
    mistakes = []
    for epoch in range(epochs):
        errs = 0
        for i, label in enumerate(y):
            # Signed margin of example i under the current mistake counts.
            margin = label * np.sum(alpha * y * K[:, i])
            if margin <= 0:  # misclassified (or tied): count a mistake
                alpha[i] += 1.0
                errs += 1
        mistakes.append(errs)
    return alpha, mistakes

alpha_p, mistakes = kernel_perceptron_fit(X, y, gamma=2.0, epochs=10)
support_like = np.sum(alpha_p > 0)
train_scores = rbf_kernel(X, gamma=2.0) @ (alpha_p * y)
train_acc = np.mean(np.sign(train_scores) == y)
print("Mistakes by epoch:", mistakes)
print(f"Active training points: {support_like}")
print(f"Training accuracy: {train_acc:.3f}")
check_true("kernel perceptron fits most ring points", train_acc > 0.95)
Code cell 25
header("9.2 Visualize kernel perceptron decision boundary")

gx, gy, grid = train_test_grid()
# Score every grid point against the stored mistake counts.
scores_grid = rbf_kernel(grid, X, gamma=2.0) @ (alpha_p * y)
Z = scores_grid.reshape(gx.shape)

fig, ax = plt.subplots(figsize=(7, 7))
ax.contourf(gx, gy, Z, levels=30, cmap="RdBu_r", alpha=0.45)
ax.contour(gx, gy, Z, levels=[0], colors="black", linewidths=2)  # decision boundary
ax.scatter(X[y == 1, 0], X[y == 1, 1], color=COLORS["primary"], label="class +1", edgecolor="white", s=45)
ax.scatter(X[y == -1, 0], X[y == -1, 1], color=COLORS["secondary"], label="class -1", edgecolor="white", s=45)
ax.set_title("Kernel perceptron learns a nonlinear boundary")
ax.set_xlabel("$x_1$")
ax.set_ylabel("$x_2$")
ax.legend()
fig.tight_layout()
plt.show()
print("Displayed kernel perceptron decision boundary.")
10. SVM dual structure
The SVM dual objective uses only labels and kernel values. This cell builds the quadratic form used by the dual.
Code cell 27
header("10.1 Build the SVM dual quadratic form")

# Q[i, j] = y_i y_j k(x_i, x_j): the Hessian of the SVM dual objective.
Q = (y[:, None] * y[None, :]) * K_rbf
eig_Q = psd_eigenvalues(Q)
print(f"Q shape: {Q.shape}")
print(f"Q min eigenvalue: {eig_Q.min():.3e}")
check_true("SVM dual Q matrix is PSD up to tolerance", eig_Q.min() > -1e-8)
print("Dual objective: sum(alpha) - 0.5 * alpha.T @ Q @ alpha with box/equality constraints.")
11. Kernel ridge regression
Kernel ridge regression gives a closed-form nonlinear smoother.
Code cell 29
header("11.1 Fit kernel ridge regression to noisy data")

def krr_fit(X_train, y_train, gamma=5.0, lam=1e-2):
    """Solve (K + lam I) alpha = y for the KRR dual coefficients."""
    K = rbf_kernel(X_train, gamma=gamma)
    return np.linalg.solve(K + lam * np.eye(len(X_train)), y_train)

def krr_predict(X_train, alpha, X_test, gamma=5.0):
    """Evaluate the KRR predictor k(X_test, X_train) @ alpha."""
    return rbf_kernel(X_test, X_train, gamma=gamma) @ alpha

# Noisy 1D regression target: a sine wave plus a linear trend.
x_train = np.linspace(-3, 3, 45)[:, None]
y_true = np.sin(2.2 * x_train[:, 0]) + 0.25 * x_train[:, 0]
y_train = y_true + 0.18 * np.random.randn(len(x_train))
x_test = np.linspace(-3.5, 3.5, 300)[:, None]

alpha_krr = krr_fit(x_train, y_train, gamma=3.0, lam=0.05)
y_pred = krr_predict(x_train, alpha_krr, x_test, gamma=3.0)
print(f"KRR coefficient norm: {np.linalg.norm(alpha_krr):.3f}")
check_true("KRR prediction has correct shape", y_pred.shape == (len(x_test),))
Code cell 30
header("11.2 Visualize KRR fit")

fig, ax = plt.subplots(figsize=(9, 5))
ax.scatter(x_train[:, 0], y_train, color=COLORS["neutral"], label="noisy training data", alpha=0.75)
ax.plot(x_test[:, 0], y_pred, color=COLORS["primary"], label="KRR prediction")
# Ground-truth curve for visual comparison.
truth = np.sin(2.2 * x_test[:, 0]) + 0.25 * x_test[:, 0]
ax.plot(x_test[:, 0], truth, color=COLORS["secondary"], linestyle="--", label="true function")
ax.set_title("Kernel ridge regression with an RBF kernel")
ax.set_xlabel("$x$")
ax.set_ylabel("$f(x)$")
ax.legend()
fig.tight_layout()
plt.show()
print("Displayed KRR regression curve.")
Code cell 31
header("11.3 Regularization path for KRR")

lams = [1e-4, 1e-2, 1e-1, 1.0]
palette = [COLORS["highlight"], COLORS["primary"], COLORS["secondary"], COLORS["tertiary"]]
fig, ax = plt.subplots(figsize=(9, 5))
ax.scatter(x_train[:, 0], y_train, color=COLORS["neutral"], alpha=0.35, label="data")
for lam, color in zip(lams, palette):
    # Refit at each regularization strength and overlay the curve.
    coeffs = krr_fit(x_train, y_train, gamma=3.0, lam=lam)
    curve = krr_predict(x_train, coeffs, x_test, gamma=3.0)
    ax.plot(x_test[:, 0], curve, color=color, label=f"$\\lambda={lam}$")
ax.set_title("Regularization changes KRR smoothness")
ax.set_xlabel("$x$")
ax.set_ylabel("prediction")
ax.legend()
fig.tight_layout()
plt.show()
print("Displayed KRR regularization path.")
12. Kernel PCA
Kernel PCA performs PCA in feature space through the centered Gram matrix.
Code cell 33
header("12.1 Compute kernel PCA embedding")

def kernel_pca_embedding(X, gamma=1.0, n_components=2):
    """Project data onto the top eigenvectors of the centered RBF kernel.

    Returns the n_components-dimensional embedding and all eigenvalues
    sorted in descending order.
    """
    K = center_kernel(rbf_kernel(X, gamma=gamma))
    eigvals, eigvecs = np.linalg.eigh((K + K.T) / 2)
    order = np.argsort(eigvals)[::-1]  # descending
    eigvals, eigvecs = eigvals[order], eigvecs[:, order]
    # Clip tiny negative eigenvalues (numerical error) before the sqrt.
    leading = np.maximum(eigvals[:n_components], 0.0)
    embedding = eigvecs[:, :n_components] * np.sqrt(leading)
    return embedding, eigvals

emb, eigvals_kpca = kernel_pca_embedding(X, gamma=1.5, n_components=2)
print("Top 5 centered-kernel eigenvalues:", np.round(eigvals_kpca[:5], 4))
check_true("kernel PCA embedding has two coordinates", emb.shape == (len(X), 2))
Code cell 34
header("12.2 Visualize kernel PCA embedding")

fig, ax = plt.subplots(figsize=(7, 7))
for label, color, legend in [(1, COLORS["primary"], "class +1"), (-1, COLORS["secondary"], "class -1")]:
    mask = y == label
    ax.scatter(emb[mask, 0], emb[mask, 1], color=color, label=legend, alpha=0.8)
ax.set_title("Kernel PCA embedding separates nonlinear rings")
ax.set_xlabel("Kernel PC 1")
ax.set_ylabel("Kernel PC 2")
ax.legend()
fig.tight_layout()
plt.show()
print("Displayed kernel PCA embedding.")
13. Gaussian process regression
A Gaussian process uses the kernel as a covariance function.
Code cell 36
header("13.1 Compute GP posterior mean and variance")

def gp_posterior(X_train, y_train, X_test, gamma=2.0, noise=0.08, jitter=1e-8):
    """Exact GP regression posterior with an RBF prior (unit signal variance).

    Returns the posterior mean and the (noise-free) latent variance at
    each test point, using a Cholesky solve for stability.
    """
    n = len(X_train)
    K = rbf_kernel(X_train, gamma=gamma) + (noise**2 + jitter) * np.eye(n)
    Ks = rbf_kernel(X_train, X_test, gamma=gamma)
    # k(x, x) = 1 for the RBF kernel, so the prior diagonal is all ones.
    prior_diag = np.ones(len(X_test))
    L = np.linalg.cholesky(K)
    alpha = np.linalg.solve(L.T, np.linalg.solve(L, y_train))
    mean = Ks.T @ alpha
    v = np.linalg.solve(L, Ks)
    var = np.maximum(prior_diag - np.sum(v * v, axis=0), 0.0)
    return mean, var

# Subsample the KRR training set for a sparser GP example.
idx = np.linspace(0, len(x_train) - 1, 18).astype(int)
X_gp = x_train[idx]
y_gp = y_train[idx]
gp_mean, gp_var = gp_posterior(X_gp, y_gp, x_test, gamma=2.5, noise=0.12)
print(f"GP posterior mean shape: {gp_mean.shape}")
print(f"GP posterior variance min/max: {gp_var.min():.4f}/{gp_var.max():.4f}")
check_true("GP variances are nonnegative", np.all(gp_var >= -1e-10))
Code cell 37
header("13.2 Visualize GP posterior")

std = np.sqrt(gp_var)
fig, ax = plt.subplots(figsize=(9, 5))
ax.scatter(X_gp[:, 0], y_gp, color=COLORS["neutral"], label="observed data", zorder=3)
ax.plot(x_test[:, 0], gp_mean, color=COLORS["primary"], label="posterior mean")
# Two posterior standard deviations approximate a 95% credible band.
ax.fill_between(x_test[:, 0], gp_mean - 2*std, gp_mean + 2*std, color=COLORS["primary"], alpha=0.2, label="95% band")
ax.set_title("Gaussian process posterior uncertainty")
ax.set_xlabel("$x$")
ax.set_ylabel("$f(x)$")
ax.legend()
fig.tight_layout()
plt.show()
print("Displayed GP posterior mean and uncertainty band.")
14. Kernel matrix cost
Exact kernels are limited by the $n \times n$ Gram matrix.
Code cell 39
header("14.1 Estimate kernel memory cost")

ns = np.array([1_000, 5_000, 10_000, 50_000, 100_000])
gb_float64 = ns**2 * 8 / 1e9  # n^2 entries at 8 bytes each
for n, gb in zip(ns, gb_float64):
    print(f"n={n:6d}: float64 Gram matrix approx {gb:8.2f} GB")
check_true("memory grows quadratically", gb_float64[-1] / gb_float64[-2] > 3.5)
15. Conditioning
Ridge regularization shifts Gram-matrix eigenvalues and improves numerical stability.
Code cell 41
header("15.1 Ridge shift improves conditioning")

# A very flat RBF kernel is nearly rank-deficient and badly conditioned.
K_ill = rbf_kernel(x_train, gamma=0.02)
eig = psd_eigenvalues(K_ill)
for lam in [0.0, 1e-6, 1e-3, 1e-1]:
    shifted = eig + lam  # adding lam*I shifts every eigenvalue by lam
    cond = shifted.max() / max(shifted.min(), 1e-15)
    print(f"lambda={lam:8g}: min eig={shifted.min():.3e}, condition approx={cond:.3e}")
check_true("ridge shift raises the smallest eigenvalue", (eig + 1e-3).min() > eig.min())
16. Nystrom approximation
Nystrom methods approximate the full Gram matrix using landmark points.
Code cell 43
header("16.1 Nystrom approximation of an RBF Gram matrix")

def nystrom_approximation(X, m=25, gamma=1.5, jitter=1e-6):
    """Low-rank Nystrom estimate C W^+ C^T from m randomly chosen landmarks.

    Returns the approximate Gram matrix and the landmark indices.
    """
    idx = np.random.choice(len(X), size=m, replace=False)
    landmarks = X[idx]
    C = rbf_kernel(X, landmarks, gamma=gamma)       # n x m cross-kernel
    W = rbf_kernel(landmarks, gamma=gamma)          # m x m landmark kernel
    W_inv = np.linalg.pinv(W + jitter * np.eye(m))  # jitter stabilizes the pseudoinverse
    return C @ W_inv @ C.T, idx

K_full = rbf_kernel(X, gamma=1.5)
errors = []
ms = [5, 10, 20, 40, 80]
for m in ms:
    K_approx, _ = nystrom_approximation(X, m=m, gamma=1.5)
    rel = np.linalg.norm(K_full - K_approx, "fro") / np.linalg.norm(K_full, "fro")
    errors.append(rel)
    print(f"m={m:3d}: relative Frobenius error={rel:.4f}")
check_true("larger landmark budget improves approximation overall", errors[-1] < errors[0])
Code cell 44
header("16.2 Plot Nystrom error curve")

# Error versus landmark budget from the previous cell.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(ms, errors, marker="o", color=COLORS["primary"])
ax.set_title("Nystrom approximation improves with more landmarks")
ax.set_xlabel("Number of landmarks $m$")
ax.set_ylabel("Relative Frobenius error")
fig.tight_layout()
plt.show()
print("Displayed Nystrom approximation error curve.")
17. Random Fourier features
Random Fourier features approximate shift-invariant kernels with explicit random features.
Code cell 46
header("17.1 Random Fourier feature map")

def rff_features(X, D=200, gamma=1.0, omega=None, b=None):
    """Random Fourier features approximating the RBF kernel exp(-gamma ||x-z||^2).

    Frequencies omega are drawn from N(0, 2*gamma I) and phases b uniformly
    on [0, 2pi); pass omega/b back in to reuse the same map on new data.
    Returns (Z, omega, b).
    """
    d = X.shape[1]
    if omega is None:
        omega = np.random.randn(D, d) * np.sqrt(2 * gamma)
    if b is None:
        b = np.random.uniform(0, 2 * np.pi, size=D)
    Z = np.sqrt(2.0 / D) * np.cos(X @ omega.T + b)
    return Z, omega, b

Ds = [25, 50, 100, 250, 500]
rff_errors = []
K_target = rbf_kernel(X, gamma=1.0)
for D in Ds:
    Z, _, _ = rff_features(X, D=D, gamma=1.0)
    K_hat = Z @ Z.T  # Monte Carlo estimate of the Gram matrix
    rel = np.linalg.norm(K_target - K_hat, "fro") / np.linalg.norm(K_target, "fro")
    rff_errors.append(rel)
    print(f"D={D:4d}: relative kernel approximation error={rel:.4f}")
check_true("RFF approximation gets reasonable by D=500", rff_errors[-1] < 0.45)
Code cell 47
header("17.2 Plot RFF approximation error")

# Error versus feature count from the previous cell.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(Ds, rff_errors, marker="o", color=COLORS["secondary"])
ax.set_title("Random Fourier feature approximation")
ax.set_xlabel("Number of random features $D$")
ax.set_ylabel("Relative Frobenius error")
fig.tight_layout()
plt.show()
print("Displayed RFF approximation error curve.")
18. Random features turn kernels into linear models
Once random features are explicit, ridge regression can be done with a linear design matrix.
Code cell 49
header("18.1 Ridge regression on random Fourier features")

# Fit ordinary ridge regression in the explicit random-feature space,
# reusing the same (omega, b) draw for train and test.
Z_train, omega, b = rff_features(x_train, D=500, gamma=3.0)
Z_test, _, _ = rff_features(x_test, D=500, gamma=3.0, omega=omega, b=b)
lam = 0.05
w_rff = np.linalg.solve(Z_train.T @ Z_train + lam * np.eye(Z_train.shape[1]), Z_train.T @ y_train)
y_rff = Z_test @ w_rff
y_exact = krr_predict(x_train, krr_fit(x_train, y_train, gamma=3.0, lam=lam), x_test, gamma=3.0)
rmse_between = np.sqrt(np.mean((y_rff - y_exact)**2))
print(f"RMSE between exact KRR and RFF ridge predictions: {rmse_between:.4f}")
check_true("RFF ridge tracks exact KRR reasonably", rmse_between < 0.35)
Code cell 50
header("18.2 Visualize exact KRR versus RFF ridge")

fig, ax = plt.subplots(figsize=(9, 5))
ax.scatter(x_train[:, 0], y_train, color=COLORS["neutral"], alpha=0.35, label="data")
ax.plot(x_test[:, 0], y_exact, color=COLORS["primary"], label="exact KRR")
ax.plot(x_test[:, 0], y_rff, color=COLORS["secondary"], linestyle="--", label="RFF ridge")
ax.set_title("Random features approximate kernel ridge regression")
ax.set_xlabel("$x$")
ax.set_ylabel("prediction")
ax.legend()
fig.tight_layout()
plt.show()
print("Displayed exact KRR and RFF ridge comparison.")
19. Similarity, embeddings, and cosine kernels
Normalized dot product is a kernel over normalized vectors.
Code cell 52
header("19.1 Cosine kernel over synthetic embeddings")

# Row-normalize random vectors so dot products become cosine similarities.
E = np.random.randn(12, 5)
E = E / np.linalg.norm(E, axis=1, keepdims=True)
K_cos = E @ E.T
eig_cos = psd_eigenvalues(K_cos)
print("Cosine Gram diagonal:", np.round(np.diag(K_cos), 3))
print("Minimum eigenvalue:", eig_cos.min())
check_true("cosine Gram over normalized embeddings is PSD", eig_cos.min() > -1e-10)
20. Attention and kernel feature maps
Some efficient attention mechanisms approximate exponential dot-product kernels with feature maps.
Code cell 54
header("20.1 Softmax-style exponential kernel is positive on pairs but not automatically normalized")

Q_small = np.random.randn(6, 4)
# Scaled exponential dot product, as in attention logits.
K_exp = np.exp(Q_small @ Q_small.T / np.sqrt(Q_small.shape[1]))
eig_exp = psd_eigenvalues(K_exp)
print("Exponential dot-product Gram min eigenvalue:", eig_exp.min())
check_true("exp dot-product Gram is PSD on this finite set", eig_exp.min() > -1e-8)
# Softmax row normalization makes rows sum to one but breaks symmetry.
row_softmax = K_exp / K_exp.sum(axis=1, keepdims=True)
print("Row-stochastic attention row sums:", np.round(row_softmax.sum(axis=1), 6))
check_true("row-normalized attention is not symmetric in general", not np.allclose(row_softmax, row_softmax.T))
21. Neural tangent kernel
The empirical NTK is a Gram matrix of parameter gradients. This small finite-difference example avoids autodiff dependencies.
Code cell 56
header("21.1 Empirical NTK by finite differences")

def unpack(theta, width=8):
    """Slice a flat parameter vector into (W1, b1, W2, b2) for the tiny net."""
    pos = 0
    W1 = theta[pos:pos + width].reshape(width, 1); pos += width
    b1 = theta[pos:pos + width]; pos += width
    W2 = theta[pos:pos + width].reshape(1, width); pos += width
    b2 = theta[pos]
    return W1, b1, W2, b2

def tiny_net(theta, X1, width=8):
    """One-hidden-layer tanh network mapping (n, 1) inputs to n scalar outputs."""
    W1, b1, W2, b2 = unpack(theta, width)
    H = np.tanh(X1 @ W1.T + b1)
    return (H @ W2.T).ravel() + b2

def finite_difference_jacobian(theta, X1, eps=1e-5):
    """Central-difference Jacobian of network outputs w.r.t. parameters.

    Returns an (n_inputs, n_params) matrix. Fix: the original computed an
    unperturbed forward pass (`base`) that central differences never use;
    it has been removed to save one network evaluation.
    """
    J = np.zeros((len(X1), len(theta)))
    for p in range(len(theta)):
        bump = np.zeros_like(theta)
        bump[p] = eps
        J[:, p] = (tiny_net(theta + bump, X1) - tiny_net(theta - bump, X1)) / (2 * eps)
    return J

X_ntk = np.linspace(-2, 2, 20)[:, None]
width = 8
theta = 0.5 * np.random.randn(width + width + width + 1)
J = finite_difference_jacobian(theta, X_ntk)
Theta = J @ J.T  # empirical NTK: Gram matrix of parameter gradients
eig_ntk = psd_eigenvalues(Theta)
print(f"Jacobian shape: {J.shape}")
print(f"NTK Gram min eigenvalue: {eig_ntk.min():.3e}")
check_true("empirical NTK is PSD up to numerical tolerance", eig_ntk.min() > -1e-8)
Code cell 57
header("21.2 Visualize empirical NTK")

fig, ax = plt.subplots(figsize=(7, 6))
img = ax.imshow(Theta, cmap="viridis", aspect="auto")
fig.colorbar(img, ax=ax, label="NTK value")
ax.set_title("Empirical NTK Gram matrix")
ax.set_xlabel("Input index")
ax.set_ylabel("Input index")
fig.tight_layout()
plt.show()
print("Displayed empirical NTK Gram matrix.")
22. End-to-end summary
Kernel methods reduce function-space learning to finite kernel computations. The notebook has now touched the full arc from definitions to scalable approximations and deep-learning theory.
Code cell 59
header("22.1 Final notebook checks")

# Re-validate the key artifacts produced throughout the notebook.
checks = {
    "RBF Gram PSD": psd_eigenvalues(K_rbf).min() > -1e-8,
    "KRR prediction finite": np.all(np.isfinite(y_pred)),
    "Kernel PCA finite": np.all(np.isfinite(emb)),
    "GP variance finite": np.all(np.isfinite(gp_var)),
    "RFF error finite": np.all(np.isfinite(rff_errors)),
    "NTK PSD": psd_eigenvalues(Theta).min() > -1e-8,
}
for name, ok in checks.items():
    check_true(name, ok)
print("Kernel methods theory notebook completed successfully.")