Exercises Notebook
Converted from exercises.ipynb for web reading.
Kernel Methods - Exercises
Ten exercises covering the core arc of notes.md and theory.ipynb.
Each exercise has a problem cell, a runnable scaffold, and a complete solution.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
import seaborn as sns
sns.set_theme(style="whitegrid", palette="colorblind")
HAS_SNS = True
except ImportError:
plt.style.use("seaborn-v0_8-whitegrid")
HAS_SNS = False
mpl.rcParams.update({
"figure.figsize": (10, 6),
"figure.dpi": 120,
"font.size": 13,
"axes.titlesize": 15,
"axes.labelsize": 13,
"xtick.labelsize": 11,
"ytick.labelsize": 11,
"legend.fontsize": 11,
"legend.framealpha": 0.85,
"lines.linewidth": 2.0,
"axes.spines.top": False,
"axes.spines.right": False,
"savefig.bbox": "tight",
"savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
COLORS = {
"primary": "#0077BB",
"secondary": "#EE7733",
"tertiary": "#009988",
"error": "#CC3311",
"neutral": "#555555",
"highlight": "#EE3377",
}
def header(title):
print("\n" + "=" * 78)
print(title)
print("=" * 78)
def check_true(name, condition):
ok = bool(condition)
print(f"{'PASS' if ok else 'FAIL'} - {name}")
if not ok:
raise AssertionError(name)
def check_close(name, actual, expected, tol=1e-8):
ok = np.allclose(actual, expected, atol=tol, rtol=tol)
print(f"{'PASS' if ok else 'FAIL'} - {name}")
if not ok:
raise AssertionError(name)
def linear_kernel(X, Z=None):
Z = X if Z is None else Z
return X @ Z.T
def polynomial_kernel(X, Z=None, degree=2, c=1.0):
Z = X if Z is None else Z
return (X @ Z.T + c) ** degree
def rbf_kernel(X, Z=None, gamma=1.0):
Z = X if Z is None else Z
X2 = np.sum(X*X, axis=1, keepdims=True)
Z2 = np.sum(Z*Z, axis=1, keepdims=True).T
D2 = np.maximum(X2 + Z2 - 2*X @ Z.T, 0.0)
return np.exp(-gamma * D2)
def center_kernel(K):
n = K.shape[0]
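    # H = I - (1/n) 1 1^T projects out the mean, so H K H centers the implicit features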
H = np.eye(n) - np.ones((n, n)) / n
return H @ K @ H
def psd_eigenvalues(K):
return np.linalg.eigvalsh((K + K.T) / 2)
print("Exercise helpers ready.")
Exercise 1 (Easy) - Build Gram matrices
Compute linear, polynomial, and RBF Gram matrices and check symmetry plus PSD eigenvalues.
Code cell 5
# Your Solution - Exercise 1: Build Gram matrices
answer = None
print("answer:", answer)
Code cell 6
# Solution
header("Exercise 1: Build Gram matrices")
X = np.array([[-1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, -1.0]])
K_lin = linear_kernel(X)
K_poly = polynomial_kernel(X, degree=2, c=1.0)
K_rbf = rbf_kernel(X, gamma=0.7)
for name, K in [("linear", K_lin), ("polynomial", K_poly), ("RBF", K_rbf)]:
check_true(f"{name} symmetric", np.allclose(K, K.T))
check_true(f"{name} PSD", psd_eigenvalues(K).min() > -1e-10)
print(name, "eigenvalues:", np.round(psd_eigenvalues(K), 6))
print("\nTakeaway: valid kernels produce symmetric PSD Gram matrices on every finite dataset.")
Exercise 2 (Easy) - Verify a polynomial feature map
Build an explicit feature map for the degree-2 polynomial kernel k(x, z) = (x . z + 1)^2 in two dimensions.
Code cell 8
# Your Solution - Exercise 2: Verify a polynomial feature map
answer = None
print("answer:", answer)
Code cell 9
# Solution
header("Exercise 2: Verify a polynomial feature map")
def phi_poly2(X):
x1, x2 = X[:, 0], X[:, 1]
return np.column_stack([x1**2, np.sqrt(2)*x1*x2, x2**2, np.sqrt(2)*x1, np.sqrt(2)*x2, np.ones(len(X))])
A = np.random.randn(6, 2)
B = np.random.randn(5, 2)
explicit = phi_poly2(A) @ phi_poly2(B).T
implicit = polynomial_kernel(A, B, degree=2, c=1.0)
check_close("explicit feature map matches kernel", explicit, implicit, tol=1e-10)
print("Feature dimension:", phi_poly2(A).shape[1])
print("\nTakeaway: polynomial kernels hide a concrete finite feature expansion.")
Exercise 3 (Easy) - Normalize and center a kernel
Normalize an RBF Gram matrix and center it with the centering matrix H = I - (1/n) 1 1^T, forming H K H.
Code cell 11
# Your Solution - Exercise 3: Normalize and center a kernel
answer = None
print("answer:", answer)
Code cell 12
# Solution
header("Exercise 3: Normalize and center a kernel")
X = np.random.randn(20, 3)
K = rbf_kernel(X, gamma=0.5)
d = np.sqrt(np.diag(K))
K_norm = K / (d[:, None] * d[None, :])
K_centered = center_kernel(K_norm)
check_close("normalized diagonal equals one", np.diag(K_norm), np.ones(len(X)), tol=1e-10)
check_true("centered row sums near zero", np.linalg.norm(K_centered.sum(axis=1)) < 1e-9)
check_true("centered column sums near zero", np.linalg.norm(K_centered.sum(axis=0)) < 1e-9)
print("\nTakeaway: normalization controls feature norms; centering removes the feature-space mean.")
Exercise 4 (Medium) - Kernel ridge regression
Fit noisy nonlinear data using exact RBF kernel ridge regression.
Code cell 14
# Your Solution - Exercise 4: Kernel ridge regression
answer = None
print("answer:", answer)
Code cell 15
# Solution
header("Exercise 4: Kernel ridge regression")
def krr_fit(X_train, y_train, gamma=2.0, lam=0.05):
K = rbf_kernel(X_train, gamma=gamma)
return np.linalg.solve(K + lam*np.eye(len(X_train)), y_train)
def krr_predict(X_train, alpha, X_test, gamma=2.0):
return rbf_kernel(X_test, X_train, gamma=gamma) @ alpha
X_train = np.linspace(-3, 3, 40)[:, None]
y_true = np.cos(1.7 * X_train[:, 0])
y_train = y_true + 0.12*np.random.randn(len(X_train))
alpha = krr_fit(X_train, y_train, gamma=1.8, lam=0.05)
pred = krr_predict(X_train, alpha, X_train, gamma=1.8)
rmse = np.sqrt(np.mean((pred - y_true)**2))
print("Training-grid RMSE against noiseless function:", rmse)
check_true("KRR fits the nonlinear signal", rmse < 0.25)
print("\nTakeaway: KRR solves nonlinear regression through one regularized linear system.")
Exercise 5 (Medium) - Kernel PCA
Center an RBF kernel and compute the first two kernel principal coordinates.
Code cell 17
# Your Solution - Exercise 5: Kernel PCA
answer = None
print("answer:", answer)
Code cell 18
# Solution
header("Exercise 5: Kernel PCA")
theta = np.linspace(0, 2*np.pi, 80, endpoint=False)
inner = np.column_stack([np.cos(theta), np.sin(theta)]) * 0.7
outer = np.column_stack([np.cos(theta), np.sin(theta)]) * 1.5
X = np.vstack([inner, outer]) + 0.03*np.random.randn(160, 2)
Kc = center_kernel(rbf_kernel(X, gamma=2.0))
eigvals, eigvecs = np.linalg.eigh((Kc + Kc.T)/2)
order = np.argsort(eigvals)[::-1]
coords = eigvecs[:, order[:2]] * np.sqrt(np.maximum(eigvals[order[:2]], 0.0))
check_true("kernel PCA coordinates are finite", np.all(np.isfinite(coords)))
check_true("top kernel eigenvalue positive", eigvals[order[0]] > 0)
print("Coordinate shape:", coords.shape)
print("\nTakeaway: kernel PCA is ordinary spectral analysis applied to a centered feature-space Gram matrix.")
Exercise 6 (Medium) - Gaussian process posterior
Compute posterior mean and variance for one-dimensional GP regression.
Code cell 20
# Your Solution - Exercise 6: Gaussian process posterior
answer = None
print("answer:", answer)
Code cell 21
# Solution
header("Exercise 6: Gaussian process posterior")
def gp_posterior(X_train, y_train, X_test, gamma=2.0, noise=0.1):
K = rbf_kernel(X_train, gamma=gamma) + noise**2*np.eye(len(X_train))
Ks = rbf_kernel(X_train, X_test, gamma=gamma)
L = np.linalg.cholesky(K + 1e-8*np.eye(len(X_train)))
alpha = np.linalg.solve(L.T, np.linalg.solve(L, y_train))
mean = Ks.T @ alpha
v = np.linalg.solve(L, Ks)
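    # the RBF kernel has k(x, x) = 1, so 1.0 below is the exact prior variance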
var = np.maximum(1.0 - np.sum(v*v, axis=0), 0.0)
return mean, var
X_train = np.linspace(-2.5, 2.5, 12)[:, None]
y_train = np.sin(2*X_train[:, 0]) + 0.1*np.random.randn(len(X_train))
X_test = np.linspace(-3, 3, 100)[:, None]
mean, var = gp_posterior(X_train, y_train, X_test)
check_true("posterior mean finite", np.all(np.isfinite(mean)))
check_true("posterior variance nonnegative", np.all(var >= -1e-10))
print("Variance range:", float(var.min()), float(var.max()))
print("\nTakeaway: a GP kernel is a covariance function that yields both prediction and uncertainty.")
Exercise 7 (Hard) - Random Fourier features
Approximate the RBF kernel and measure error as feature count increases.
Code cell 23
# Your Solution - Exercise 7: Random Fourier features
answer = None
print("answer:", answer)
Code cell 24
# Solution
header("Exercise 7: Random Fourier features")
def rff_features(X, D, gamma=1.0):
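    # omega ~ N(0, 2*gamma I): the spectral density of exp(-gamma ||x - z||^2) by Bochner's theorem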
omega = np.random.randn(D, X.shape[1]) * np.sqrt(2*gamma)
b = np.random.uniform(0, 2*np.pi, D)
return np.sqrt(2.0/D) * np.cos(X @ omega.T + b)
X = np.random.randn(60, 2)
K = rbf_kernel(X, gamma=0.8)
errors = []
for D in [25, 50, 100, 250, 500]:
Z = rff_features(X, D, gamma=0.8)
err = np.linalg.norm(K - Z@Z.T, "fro") / np.linalg.norm(K, "fro")
errors.append(err)
print(D, err)
check_true("RFF approximation is finite", np.all(np.isfinite(errors)))
check_true("largest feature budget is reasonably accurate", errors[-1] < 0.55)
print("\nTakeaway: random features trade exact kernel algebra for scalable approximate linear features.")
Exercise 8 (Hard) - Empirical neural tangent kernel
Compute a finite-difference NTK and verify PSD structure.
Code cell 26
# Your Solution - Exercise 8: Empirical neural tangent kernel
answer = None
print("answer:", answer)
Code cell 27
# Solution
header("Exercise 8: Empirical neural tangent kernel")
def unpack(theta, width=6):
idx = 0
W1 = theta[idx:idx+width].reshape(width, 1); idx += width
b1 = theta[idx:idx+width]; idx += width
W2 = theta[idx:idx+width].reshape(1, width); idx += width
b2 = theta[idx]
return W1, b1, W2, b2
def net(theta, X, width=6):
W1, b1, W2, b2 = unpack(theta, width)
H = np.tanh(X @ W1.T + b1)
return (H @ W2.T).ravel() + b2
def fd_jac(theta, X, eps=1e-5):
J = np.zeros((len(X), len(theta)))
for p in range(len(theta)):
step = np.zeros_like(theta)
step[p] = eps
J[:, p] = (net(theta + step, X) - net(theta - step, X)) / (2*eps)
return J
X = np.linspace(-2, 2, 16)[:, None]
width = 6
theta = 0.4*np.random.randn(width + width + width + 1)
J = fd_jac(theta, X)
Theta = J @ J.T
check_true("NTK matrix symmetric", np.allclose(Theta, Theta.T, atol=1e-9))
check_true("NTK matrix PSD", psd_eigenvalues(Theta).min() > -1e-8)
print("NTK shape:", Theta.shape)
print("\nTakeaway: the NTK is a kernel because it is a Gram matrix of parameter-gradient features.")
Exercise 9 - Nyström kernel approximation
Approximate an RBF Gram matrix from a small landmark set and measure the relative approximation error against the full matrix.
Code cell 29
# Your Solution
X = np.linspace(-2, 2, 12)[:, None]
landmarks = X[[0, 5, 11]]
print("Compute a Nystrom approximation for the RBF kernel.")
Code cell 30
# Solution
header("Exercise 9: Nystrom Approximation")
X = np.linspace(-2, 2, 12)[:, None]
landmarks = X[[0, 5, 11]]
K = rbf_kernel(X, gamma=0.7)
C = rbf_kernel(X, landmarks, gamma=0.7)
W = rbf_kernel(landmarks, gamma=0.7)
K_hat = C @ np.linalg.pinv(W) @ C.T
rel_error = np.linalg.norm(K - K_hat, "fro") / np.linalg.norm(K, "fro")
print("relative Frobenius error:", round(float(rel_error), 6))
check_true("approximation error is finite", np.isfinite(rel_error))
print("Takeaway: Nystrom trades exact kernel geometry for lower-rank computation.")
Exercise 10 - Centered kernel alignment
Compare the centered alignment between the label kernel and two input kernels, and interpret which kernel better matches the target geometry.
Code cell 32
# Your Solution
X = np.array([[-1.5], [-0.5], [0.5], [1.5]])
y = np.array([0.0, 1.0, 1.0, 0.0])
print("Compute centered kernel alignment.")
Code cell 33
# Solution
header("Exercise 10: Centered Kernel Alignment")
X = np.array([[-1.5], [-0.5], [0.5], [1.5]])
y = np.array([0.0, 1.0, 1.0, 0.0])
Y = np.outer(y - y.mean(), y - y.mean())
def alignment(K, Y):
Kc = center_kernel(K)
Yc = center_kernel(Y)
return float(np.sum(Kc * Yc) / (np.linalg.norm(Kc, "fro") * np.linalg.norm(Yc, "fro")))
lin = alignment(linear_kernel(X), Y)
rbf = alignment(rbf_kernel(X, gamma=1.0), Y)
print("linear alignment:", round(lin, 6))
print("rbf alignment:", round(rbf, 6))
check_true("RBF captures the nonmonotone labels better", rbf > lin)
print("Takeaway: alignment is a diagnostic for whether kernel geometry matches the prediction task.")