Math for LLMs: Multivariate Calculus / Jacobians and Hessians

Exercises Notebook

Converted from exercises.ipynb for web reading.

Jacobians and Hessians - Exercises

10 graded exercises covering the full section arc, from multivariate calculus mechanics to ML-facing gradient systems.

Each exercise has three cells:

  • Problem: markdown cell with the task description
  • Your Solution: code cell for learner work
  • Solution: reference solution with checks

Difficulty: straightforward -> moderate -> challenging.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

import numpy as np
import numpy.linalg as la
from scipy import optimize, special, stats
from scipy.optimize import minimize, fsolve, linprog
from math import factorial
import math
import matplotlib.patches as patches

COLORS = {
    "primary": "#0077BB",
    "secondary": "#EE7733",
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",
    "highlight": "#EE3377",
}
HAS_MPL = True
np.set_printoptions(precision=8, suppress=True)
np.random.seed(42)
spmin = minimize

try:
    import torch
    HAS_TORCH = True
except ImportError:
    torch = None
    HAS_TORCH = False


def header(title):
    print("\n" + "=" * len(title))
    print(title)
    print("=" * len(title))

def check_true(name, cond):
    ok = bool(cond)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    return ok

def check_close(name, got, expected, tol=1e-8):
    ok = np.allclose(got, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}: got {got}, expected {expected}")
    return ok

def check(name, got, expected, tol=1e-8):
    return check_close(name, got, expected, tol=tol)

def sigmoid(x):
    x = np.asarray(x, dtype=float)
    return np.where(x >= 0, 1/(1+np.exp(-x)), np.exp(x)/(1+np.exp(x)))

def softmax(z, axis=-1):
    z = np.asarray(z, dtype=float)
    z = z - np.max(z, axis=axis, keepdims=True)
    e = np.exp(z)
    return e / np.sum(e, axis=axis, keepdims=True)

def relu(x):
    return np.maximum(0, x)

def relu_prime(x):
    return np.where(np.asarray(x) > 0, 1.0, 0.0)

def centered_diff(f, x, h=1e-6):
    return (f(x + h) - f(x - h)) / (2 * h)

def numerical_gradient(f, x, h=1e-6):
    x = np.asarray(x, dtype=float)
    grad = np.zeros_like(x, dtype=float)
    for idx in np.ndindex(x.shape):
        xp = x.copy(); xm = x.copy()
        xp[idx] += h; xm[idx] -= h
        grad[idx] = (f(xp) - f(xm)) / (2*h)
    return grad

def numerical_jacobian(f, x, h=1e-6):
    x = np.asarray(x, dtype=float)
    y0 = np.asarray(f(x), dtype=float)
    J = np.zeros((y0.size, x.size), dtype=float)
    flat = x.reshape(-1)
    for j in range(x.size):
        xp = flat.copy(); xm = flat.copy()
        xp[j] += h; xm[j] -= h
        J[:, j] = ((np.asarray(f(xp.reshape(x.shape))) - np.asarray(f(xm.reshape(x.shape)))) / (2*h)).reshape(-1)
    return J.reshape(y0.shape + x.shape)

def grad_check(f, x, analytic_grad, h=1e-6):
    numeric_grad = numerical_gradient(f, x, h=h)
    denom = la.norm(analytic_grad) + la.norm(numeric_grad) + 1e-12
    return la.norm(analytic_grad - numeric_grad) / denom



def jacobian_fd(f, x, h=1e-6):
    x = np.asarray(x, dtype=float)
    y0 = np.asarray(f(x), dtype=float)
    J = np.zeros((y0.size, x.size), dtype=float)
    flat = x.reshape(-1)
    for j in range(x.size):
        xp = flat.copy(); xm = flat.copy()
        xp[j] += h; xm[j] -= h
        yp = np.asarray(f(xp.reshape(x.shape)), dtype=float).reshape(-1)
        ym = np.asarray(f(xm.reshape(x.shape)), dtype=float).reshape(-1)
        J[:, j] = (yp - ym) / (2*h)
    return J.reshape(y0.shape + x.shape)

def hessian_fd(f, x, h=1e-5):
    x = np.asarray(x, dtype=float)
    H = np.zeros((x.size, x.size), dtype=float)
    flat = x.reshape(-1)
    for j in range(x.size):
        xp = flat.copy(); xm = flat.copy()
        xp[j] += h; xm[j] -= h
        gp = numerical_gradient(lambda z: f(z.reshape(x.shape)), xp.reshape(x.shape), h=h).reshape(-1)
        gm = numerical_gradient(lambda z: f(z.reshape(x.shape)), xm.reshape(x.shape), h=h).reshape(-1)
        H[:, j] = (gp - gm) / (2*h)
    return H.reshape(x.shape + x.shape)



def grad_fd(f, x, h=1e-6):
    return numerical_gradient(f, x, h=h)



def fd_grad(f, x, h=1e-6):
    return numerical_gradient(f, np.asarray(x, dtype=float), h=h)

print("Chapter helper setup complete.")

Exercise 1 (★): Jacobian Computation and JVP/VJP

Let $f: \mathbb{R}^3 \to \mathbb{R}^2$ be $f(x_1, x_2, x_3) = (x_1^2 + x_2 x_3,\; e^{x_1} - x_3^2)$.

(a) Compute $J_f(\mathbf{x})$ analytically for general $\mathbf{x}$.

(b) Evaluate $J_f$ at $\mathbf{x}_0 = (0, 1, 1)^\top$ and verify against finite differences.

(c) Compute the JVP $J_f \mathbf{v}$ at $\mathbf{x}_0$ for $\mathbf{v} = (1, 0, -1)^\top$.

(d) Compute the VJP $J_f^\top \mathbf{u}$ at $\mathbf{x}_0$ for $\mathbf{u} = (1, 2)^\top$.

(e) Verify the duality: $\mathbf{u}^\top(J_f\mathbf{v}) = (J_f^\top\mathbf{u})^\top\mathbf{v}$.

Code cell 5

# Your Solution
# Exercise 1 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 1.")

Code cell 6

# Solution
# Exercise 1 - reference solution

def f_ex1(x):
    return np.array([x[0]**2 + x[1]*x[2],
                     np.exp(x[0]) - x[2]**2])

def jacobian_ex1(x):
    """Analytical Jacobian of f_ex1."""
    return np.array([[2*x[0],   x[2],    x[1]],
                     [np.exp(x[0]), 0.0, -2*x[2]]])

x0 = np.array([0.0, 1.0, 1.0])
v  = np.array([1.0, 0.0, -1.0])
u  = np.array([1.0, 2.0])

J = jacobian_ex1(x0)
J_fd = jacobian_fd(f_ex1, x0)

header('Exercise 1: Jacobian Computation')
print('Jacobian (analytical):')
print(J)

check_close('Jacobian matches FD', J, J_fd, tol=1e-7)

# (c) JVP
jvp = J @ v
print(f'\nJVP J@v = {jvp}')
# Row 1: [0, 1, 1] @ (1, 0, -1) = -1;  Row 2: [1, 0, -2] @ (1, 0, -1) = 3
jvp_expected = np.array([-1.0, 3.0])
check_close('JVP correct', jvp, jvp_expected)

# (d) VJP
vjp = J.T @ u
print(f'VJP J^T@u = {vjp}')
# = [2*0*1 + exp(0)*2, 1*1 + 0*2, 1*1 + (-2)*2] = [2, 1, -3]
vjp_expected = np.array([2.0, 1.0, -3.0])
check_close('VJP correct', vjp, vjp_expected)

# (e) Duality
lhs = u @ (J @ v)   # u^T (Jv)
rhs = vjp @ v       # (J^T u)^T v
check_close('Duality u^T(Jv) = (J^Tu)^Tv', lhs, rhs)

print('\nTakeaway: JVP and VJP are transposes — backprop is repeated VJP, costing O(1) passes for scalar loss.')

print("Exercise 1 solution complete.")

Exercise 2 (★): Softmax Jacobian Properties

Let $\sigma: \mathbb{R}^K \to \mathbb{R}^K$ be softmax, $\mathbf{p} = \sigma(\mathbf{z})$.

(a) For $\mathbf{z} = (1, 2, 0)^\top$, compute $\mathbf{p}$ and $J_\sigma = \text{diag}(\mathbf{p}) - \mathbf{p}\mathbf{p}^\top$.

(b) Verify $J_\sigma \mathbf{1} = \mathbf{0}$ and that $J_\sigma$ is symmetric.

(c) Compute the eigenvalues of $J_\sigma$ and confirm rank $= K-1$.

(d) Derive the cross-entropy gradient: for $\mathcal{L} = -\log p_y$, show $\partial\mathcal{L}/\partial\mathbf{z} = \mathbf{p} - \mathbf{e}_y$.

(e) Verify part (d) numerically for $y = 1$ (second class).

Code cell 8

# Your Solution
# Exercise 2 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 2.")

Code cell 9

# Solution
# Exercise 2 - reference solution

z = np.array([1.0, 2.0, 0.0])
K = len(z)

def softmax_jacobian(z):
    p = softmax(z)
    return np.diag(p) - np.outer(p, p)

p = softmax(z)
J = softmax_jacobian(z)

header('Exercise 2: Softmax Jacobian')
print(f'p = {p.round(6)}')
print('J_sigma:')
print(J.round(6))

# (b)
ones = np.ones(K)
check_close('J @ 1 = 0 (null space)', J @ ones, np.zeros(K), tol=1e-10)
check_true('J is symmetric', np.allclose(J, J.T))

# (c)
eigvals = np.sort(la.eigvalsh(J))[::-1]
print(f'Eigenvalues: {eigvals.round(6)}')
rank_J = la.matrix_rank(J)
check_true(f'Rank = K-1 = {K-1}', rank_J == K-1)
check_true('All eigenvalues >= 0 (PSD)', np.all(eigvals >= -1e-10))

# (d)+(e): Cross-entropy gradient
y_class = 1  # index of true class
e_y = np.zeros(K); e_y[y_class] = 1.0

# Analytical: grad_z L = p - e_y
ce_grad_ana = p - e_y

# Numerical: L = -log(p_y) = -log(softmax(z)[y])
def ce_loss(z, y=y_class):
    return -np.log(softmax(z)[y])

ce_grad_fd = jacobian_fd(lambda z: np.array([ce_loss(z)]), z)[0]

print(f'\nCE gradient (analytical p-e_y): {ce_grad_ana.round(6)}')
print(f'CE gradient (FD):               {ce_grad_fd.round(6)}')
check_close('CE gradient = p - e_y', ce_grad_ana, ce_grad_fd, tol=1e-6)

print('\nTakeaway: The clean gradient p-e_y comes from the softmax Jacobian structure.')

print("Exercise 2 solution complete.")

Exercise 3 (★): Hessian and Critical Point Classification

Let $f(x, y) = x^3 + 3xy - y^3$.

(a) Solve $\nabla f = 0$ to find all critical points.

(b) Compute $H_f$ at each critical point.

(c) Classify each point (local min, max, or saddle) using eigenvalues.

(d) For the saddle point, plot the contour and the two eigenvector directions showing curvature up and curvature down.

Code cell 11

# Your Solution
# Exercise 3 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 3.")

Code cell 12

# Solution
# Exercise 3 - reference solution

def f_ex3(x):
    return x[0]**3 + 3*x[0]*x[1] - x[1]**3

# Critical points:
# df/dx = 3x^2 + 3y = 0  =>  y = -x^2
# df/dy = 3x - 3y^2 = 0  =>  x = y^2 = x^4  =>  x(x^3-1)=0
# => x=0, y=0  OR  x=1, y=-1
critical_points = [np.array([0., 0.]), np.array([1., -1.])]

header('Exercise 3: Critical Point Classification')
for cp in critical_points:
    H = hessian_fd(f_ex3, cp)
    eigvals = np.sort(la.eigvalsh(H))
    grad_at_cp = np.array([3*cp[0]**2 + 3*cp[1], 3*cp[0] - 3*cp[1]**2])

    if np.all(eigvals > 0):
        ctype = 'LOCAL MINIMUM'
    elif np.all(eigvals < 0):
        ctype = 'LOCAL MAXIMUM'
    else:
        ctype = 'SADDLE POINT'

    print(f'\nPoint {cp}:')
    print(f'  grad = {grad_at_cp.round(8)}')
    print(f'  eigenvalues = {eigvals.round(6)}')
    print(f'  Type: {ctype}')

    check_true(f'gradient ~ 0 at {cp}', la.norm(grad_at_cp) < 1e-8)

# (d) Visualize the saddle, which is at (0, 0); (1, -1) is the local minimum
if HAS_MPL:
    cp_sad = np.array([0., 0.])
    H_sad = hessian_fd(f_ex3, cp_sad)
    evals, evecs = la.eigh(H_sad)

    xx, yy = np.meshgrid(np.linspace(-0.8, 0.8, 60), np.linspace(-0.8, 0.8, 60))
    zz = xx**3 + 3*xx*yy - yy**3

    fig, ax = plt.subplots(figsize=(8, 7))
    cf = ax.contourf(xx, yy, zz, levels=25, cmap='RdBu_r', alpha=0.8)
    plt.colorbar(cf, ax=ax)
    ax.contour(xx, yy, zz, levels=25, colors='white', linewidths=0.4, alpha=0.4)
    ax.plot(*cp_sad, 'ko', ms=12, label='Saddle (0,0)')

    scale = 0.35
    for ev, vec in zip(evals, evecs.T):
        c = 'lime' if ev > 0 else 'red'
        tip = cp_sad + scale*vec
        ax.annotate('', xy=tip, xytext=cp_sad,
                    arrowprops=dict(arrowstyle='->', color=c, lw=2.5))
        ax.text(*(tip + 0.05*vec), f'λ={ev:.2f}', color=c, fontsize=11)

    ax.set_xlabel('x'); ax.set_ylabel('y')
    ax.set_title('f(x,y) = x³+3xy-y³: Saddle at (0,0)')
    plt.tight_layout(); plt.show()
    print('Figure: green=curvature up (min direction), red=curvature down (max direction)')

print('\nTakeaway: Hessian eigenvalues classify critical points geometrically.')

print("Exercise 3 solution complete.")

Exercise 4 (★★): LayerNorm Jacobian

LayerNorm: $\text{LN}(\mathbf{x}) = (\mathbf{x} - \bar{x}\mathbf{1})/\sigma$, with $\bar{x} = \tfrac{1}{d}\sum_i x_i$, $\sigma^2 = \tfrac{1}{d}\sum_i (x_i - \bar{x})^2$, and $\hat{\mathbf{x}} = (\mathbf{x} - \bar{x}\mathbf{1})/\sigma$.

(a) Derive and implement $J_{\text{LN}} = \frac{1}{\sigma}\left(I - \frac{1}{d}\mathbf{1}\mathbf{1}^\top - \frac{1}{d}\hat{\mathbf{x}}\hat{\mathbf{x}}^\top\right)$.

(b) Verify against finite differences for $\mathbf{x} = (1, 2, 3, 4)^\top$.

(c) Show $\mathbf{1}$ and $\hat{\mathbf{x}}$ are in the null space.

(d) Compute the rank of $J_{\text{LN}}$ and explain what it means geometrically.

Code cell 14

# Your Solution
# Exercise 4 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 4.")

Code cell 15

# Solution
# Exercise 4 - reference solution

def layernorm(x):
    mu = x.mean()
    sigma = np.sqrt(((x-mu)**2).mean())
    return (x - mu) / sigma

def layernorm_jacobian(x):
    d = len(x)
    mu = x.mean()
    sigma = np.sqrt(((x-mu)**2).mean())
    x_hat = (x - mu) / sigma
    I = np.eye(d)
    return (1/sigma) * (I - np.ones((d,d))/d - np.outer(x_hat, x_hat)/d)

x = np.array([1.0, 2.0, 3.0, 4.0])
d = len(x)

J_ln = layernorm_jacobian(x)
J_ln_fd = jacobian_fd(layernorm, x)

header('Exercise 4: LayerNorm Jacobian')
print('J_LN (analytical):')
print(J_ln.round(6))

check_close('J_LN matches FD', J_ln, J_ln_fd, tol=1e-6)

# (c) Null space
ones_vec = np.ones(d)
mu = x.mean(); sigma = np.sqrt(((x-mu)**2).mean())
x_hat = (x - mu) / sigma

check_close('J_LN @ 1 = 0', J_ln @ ones_vec, np.zeros(d), tol=1e-10)
check_close('J_LN @ x_hat = 0', J_ln @ x_hat, np.zeros(d), tol=1e-10)

# (d) Rank
rank_ln = la.matrix_rank(J_ln)
check_true(f'rank = d-2 = {d-2}', rank_ln == d-2)

print(f'\nRank = {rank_ln}: LayerNorm cannot distinguish inputs that differ')
print('by adding a constant (direction 1) or scaling (direction x_hat).')
print('\nTakeaway: LayerNorm removes mean and variance information — its Jacobian has rank d-2.')

print("Exercise 4 solution complete.")

Exercise 5 (★★): HVPs and Edge of Stability

MSE loss: $f(\mathbf{w}) = \frac{1}{2m}\|X\mathbf{w} - \mathbf{y}\|^2$, Hessian $H = \frac{1}{m}X^\top X$.

(a) Implement the HVP $H\mathbf{v} = \frac{1}{m}X^\top(X\mathbf{v})$ (matrix-free).

(b) Use power iteration (20 steps) to estimate $\lambda_{\max}(H)$.

(c) Compute the max stable learning rate $\eta_{\max} = 2/\lambda_{\max}$.

(d) Run GD with $\eta = 0.9\,\eta_{\max}$ (converges) and $\eta = 1.1\,\eta_{\max}$ (diverges).

(e) Plot both loss curves and explain the edge of stability.

Code cell 17

# Your Solution
# Exercise 5 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 5.")

Code cell 18

# Solution
# Exercise 5 - reference solution

np.random.seed(42)
m_data, n_feat = 40, 8
X = np.random.randn(m_data, n_feat)
true_w = np.random.randn(n_feat)
y = X @ true_w + 0.05*np.random.randn(m_data)

def mse_loss(w):
    return 0.5 * np.mean((X @ w - y)**2)

def mse_grad(w):
    return X.T @ (X @ w - y) / m_data

def hvp_mse(v):
    """HVP for MSE: H@v = X^T(Xv)/m."""
    return X.T @ (X @ v) / m_data

def power_iter(hvp_fn, n, n_iters=20, seed=1):
    np.random.seed(seed)
    v = np.random.randn(n)
    v = v / la.norm(v)
    lam = 0.0
    for _ in range(n_iters):
        hv = hvp_fn(v)
        lam = v @ hv
        v = hv / la.norm(hv)
    return lam

lam_max = power_iter(hvp_mse, n_feat)
lam_max_true = la.eigvalsh(X.T @ X / m_data)[-1]
eta_max = 2.0 / lam_max

header('Exercise 5: HVPs and Edge of Stability')
print(f'lambda_max (power iter): {lam_max:.6f}')
print(f'lambda_max (exact):      {lam_max_true:.6f}')
check_close('Power iter matches exact', lam_max, lam_max_true, tol=0.01)

print(f'Max stable lr: eta_max = {eta_max:.6f}')

# (d) GD stable vs unstable
n_steps = 200
losses_stable, losses_unstable = [], []
w_stable = w_unstable = np.zeros(n_feat)

for _ in range(n_steps):
    w_stable   = w_stable   - 0.9*eta_max * mse_grad(w_stable)
    w_unstable = w_unstable - 1.1*eta_max * mse_grad(w_unstable)
    losses_stable.append(mse_loss(w_stable))
    losses_unstable.append(min(mse_loss(w_unstable), 1e8))

check_true('Stable GD converges', losses_stable[-1] < 0.01)
check_true('Unstable GD diverges', losses_unstable[-1] > 1e4)

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.semilogy(losses_stable, 'g-', lw=2,
                label=f'η=0.9×η_max (converges)')
    ax.semilogy(losses_unstable, 'r-', lw=2,
                label=f'η=1.1×η_max (diverges)')
    ax.set_xlabel('Steps'); ax.set_ylabel('Loss (log)')
    ax.set_title(f'Edge of Stability: η_max = {eta_max:.4f}')
    ax.legend(); plt.tight_layout(); plt.show()

print('\nTakeaway: lambda_max determines the maximum stable learning rate. '
      'Power iteration (20 HVPs) estimates it without forming H.')

print("Exercise 5 solution complete.")

Exercise 6 (★★): Newton's Method Convergence

Minimise $g(x) = \log(1+e^x) - \tfrac{3}{4}x$ (minimum at $x^* = \log 3$, where $\sigma(x^*) = \tfrac{3}{4}$).

(a) Compute $g'(x) = \sigma(x) - \tfrac{3}{4}$ and $g''(x) = \sigma(x)(1-\sigma(x))$.

(b) Run 10 steps of Newton's method from $x_0 = 2$.

(c) Run 100 steps of gradient descent with $\eta = 0.5$ from $x_0 = 2$.

(d) Plot both error sequences on a log scale and verify Newton's quadratic convergence.

(e) Compute the Newton decrement $\lambda_N = \sqrt{(g'(x))^2/g''(x)}$ at each step.

Code cell 20

# Your Solution
# Exercise 6 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 6.")

Code cell 21

# Solution
# Exercise 6 - reference solution

# Uses the numerically stable sigmoid() from the chapter setup cell.
def g(x): return np.log(1 + np.exp(x)) - 0.75*x
def g_prime(x): return sigmoid(x) - 0.75          # zero where sigma(x) = 3/4, i.e. x = log 3
def g_double_prime(x): s = sigmoid(x); return s*(1-s)

x_star = np.log(3.0)
x0 = 2.0

# Newton's method
x = x0
errors_newton = [abs(x - x_star)]
decrements_newton = []
for _ in range(10):
    decrement = np.sqrt(g_prime(x)**2 / g_double_prime(x))
    decrements_newton.append(decrement)
    x = x - g_prime(x) / g_double_prime(x)  # Newton step
    errors_newton.append(abs(x - x_star))

# Gradient descent
x_gd = x0
errors_gd = [abs(x_gd - x_star)]
eta_gd = 0.5
for _ in range(100):
    x_gd = x_gd - eta_gd * g_prime(x_gd)
    errors_gd.append(abs(x_gd - x_star))

header('Exercise 6: Newton vs GD Convergence')
print('Newton errors (with Newton decrement, part (e)):')
for i, e in enumerate(errors_newton[:8]):
    dec = f'   decrement {decrements_newton[i]:.3e}' if i < len(decrements_newton) else ''
    print(f'  step {i}: error {e:.3e}{dec}')

# Quadratic convergence check: log|e_{t+1}| ≈ 2 * log|e_t| near optimum
valid = [(errors_newton[i], errors_newton[i+1])
         for i in range(len(errors_newton)-1)
         if errors_newton[i] > 1e-12 and errors_newton[i+1] > 1e-12]

if len(valid) >= 2:
    e_t, e_t1 = valid[-2]
    ratio = np.log(e_t1+1e-20) / (np.log(e_t+1e-20) + 1e-20)
    print(f'\nLog-error ratio (should be ~2 for quadratic convergence): {ratio:.2f}')
    check_true('Quadratic convergence (ratio > 1.5)', ratio > 1.5)

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.semilogy(errors_newton, 'b-o', ms=6, lw=2, label='Newton (10 steps)')
    ax.semilogy(errors_gd[:20], 'r-', lw=2, label='GD η=0.5 (first 20 steps)')
    ax.set_xlabel('Step'); ax.set_ylabel('|x_t - x*|')
    ax.set_title('Newton (quadratic) vs GD (linear) Convergence')
    ax.legend(); plt.tight_layout(); plt.show()

print('\nTakeaway: Newton converges in ~5 steps (quadratic). GD needs hundreds (linear).')

print("Exercise 6 solution complete.")

Exercise 7 (★★★): K-FAC Kronecker Factorisation

Linear layer $\mathbf{y} = W\mathbf{x}$, $W \in \mathbb{R}^{n_{\text{out}}\times n_{\text{in}}}$.

(a) Show that $\text{vec}(\nabla_W\mathcal{L}) = \mathbf{x} \otimes \mathbf{g}$ where $\mathbf{g} = \partial\mathcal{L}/\partial\mathbf{y}$ (column-major vec).

(b) Compute the exact FIM block from 150 samples and the K-FAC approximation $A \otimes G$.

(c) Verify $(A\otimes G)^{-1} = A^{-1}\otimes G^{-1}$.

(d) Compute the K-FAC natural gradient step $\Delta W = -G^{-1}(\nabla_W\mathcal{L})A^{-1}$ and verify it matches $-(A^{-1}\otimes G^{-1})\,\text{vec}(\nabla_W\mathcal{L})$.

(e) Compare the computational cost: direct FIM inversion vs K-FAC inversion.

Code cell 23

# Your Solution
# Exercise 7 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 7.")

Code cell 24

# Solution
# Exercise 7 - reference solution

n_in_k, n_out_k = 5, 4
n_samp_k = 150
np.random.seed(42)

activations_k = np.random.randn(n_samp_k, n_in_k)
back_grads_k  = np.random.randn(n_samp_k, n_out_k) * 0.3

# (a) Verify vec(∇_W L) = x ⊗ g for one sample
a_i = activations_k[0]
g_i = back_grads_k[0]
grad_W_i = np.outer(g_i, a_i)  # (n_out, n_in)
kron_ag = np.kron(a_i, g_i)    # (n_in * n_out,)
vec_grad_W_i = grad_W_i.T.flatten()  # column-major vec

header('Exercise 7: K-FAC')
print('(a) Verify vec(∇_W) = x ⊗ g:')
check_close('kron(a,g) = vec(∇_W)', kron_ag, vec_grad_W_i)

# (b) Exact FIM block
dim_fim = n_in_k * n_out_k
fim_exact_k = np.zeros((dim_fim, dim_fim))
for i in range(n_samp_k):
    a = activations_k[i]; g = back_grads_k[i]
    v = np.kron(a, g)
    fim_exact_k += np.outer(v, v)
fim_exact_k /= n_samp_k

# K-FAC approximation
A_k = activations_k.T @ activations_k / n_samp_k
G_k = back_grads_k.T  @ back_grads_k  / n_samp_k
fim_kfac_k = np.kron(A_k, G_k)

rel_err_k = la.norm(fim_exact_k - fim_kfac_k) / la.norm(fim_exact_k)
print(f'\n(b) FIM shape: {fim_exact_k.shape}')
print(f'K-FAC relative error: {rel_err_k:.4f}')

# (c) (A⊗G)^{-1} = A^{-1} ⊗ G^{-1}
# A_k and G_k are well-conditioned here (150 samples), so no damping is needed.
# Note: damping the factors separately is NOT the same as damping A⊗G.
A_inv_k = la.inv(A_k)
G_inv_k = la.inv(G_k)
kfac_inv_k = np.kron(A_inv_k, G_inv_k)
direct_inv_k = la.inv(fim_kfac_k)

check_close('(c) (A⊗G)^-1 = A^-1⊗G^-1', kfac_inv_k, direct_inv_k, tol=1e-6)

# (d) K-FAC natural gradient step
np.random.seed(7)
grad_W_test = np.random.randn(n_out_k, n_in_k)

# Matrix form: ΔW = -G^{-1} ∇W A^{-1}
delta_W_matrix = -G_inv_k @ grad_W_test @ A_inv_k

# Vec form: -(A^-1 ⊗ G^-1) vec(∇W), column-major vec
vec_grad = grad_W_test.T.flatten()
delta_W_vec = -(kfac_inv_k @ vec_grad).reshape(n_in_k, n_out_k).T

check_close('(d) matrix form = vec form', delta_W_matrix, delta_W_vec, tol=1e-8)

# (e) Cost comparison
print(f'\n(e) Computational cost comparison:')
print(f'  Direct FIM inversion: O((n_in*n_out)^3) = O({(n_in_k*n_out_k)**3})')
print(f'  K-FAC inversion: O(n_in^3 + n_out^3) = O({n_in_k**3 + n_out_k**3})')
speedup = (n_in_k*n_out_k)**3 / (n_in_k**3 + n_out_k**3)
print(f'  Speedup factor: {speedup:.1f}x (grows roughly as min(n_in, n_out)^3)')

print('\nTakeaway: K-FAC uses Kronecker structure to reduce inversion from O(n^6) to O(n^3).')

print("Exercise 7 solution complete.")

Exercise 8 (★★★): Jacobian Determinant in Normalising Flows

RealNVP coupling layer: $\mathbf{y}_1 = \mathbf{x}_1$, $\mathbf{y}_2 = \mathbf{x}_2 \odot e^{s(\mathbf{x}_1)} + t(\mathbf{x}_1)$.

(a) Show the Jacobian is block lower-triangular.

(b) Show $\log|\det J_f| = \sum_i s_i(\mathbf{x}_1)$.

(c) Verify numerically for a specific $\mathbf{x}$.

(d) Implement the inverse and verify $f^{-1}(f(\mathbf{x})) = \mathbf{x}$.

(e) Explain why triangular Jacobians are essential for scalable generative modelling.

Code cell 26

# Your Solution
# Exercise 8 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 8.")

Code cell 27

# Solution
# Exercise 8 - reference solution

W_s = np.array([[0.5, -0.2], [0.3, 0.1]])
W_t = np.array([[0.1, 0.0], [0.0, -0.2]])

def s_net(x1): return W_s @ x1
def t_net(x1): return W_t @ x1

def coupling_fwd(x):
    x1, x2 = x[:2], x[2:]
    s, t = s_net(x1), t_net(x1)
    return np.concatenate([x1, x2 * np.exp(s) + t])

def coupling_inv(y):
    y1, y2 = y[:2], y[2:]
    x1 = y1.copy()
    s, t = s_net(x1), t_net(x1)
    x2 = (y2 - t) * np.exp(-s)
    return np.concatenate([x1, x2])

x_test = np.array([1.0, 0.5, -0.3, 2.0])

header('Exercise 8: Normalising Flow Jacobian')

# (b)+(c) Log-det verification
y_test = coupling_fwd(x_test)
J_coupling = jacobian_fd(coupling_fwd, x_test)

print('(a)+(b) Jacobian (should be lower-triangular):')
print(J_coupling.round(4))

upper_right = J_coupling[:2, 2:]
check_close('Upper-right block = 0 (triangular)', upper_right, np.zeros((2,2)), tol=1e-8)

log_det_fd = np.log(np.abs(la.det(J_coupling)))
s_at_x1 = s_net(x_test[:2])
log_det_formula = np.sum(s_at_x1)

print(f'\n(c) log|det J| (direct from J): {log_det_fd:.6f}')
print(f'    log|det J| (formula sum(s)):  {log_det_formula:.6f}')
check_close('log-det formula correct', log_det_fd, log_det_formula, tol=1e-6)

# (d) Invertibility
x_rec = coupling_inv(y_test)
check_close('(d) f^-1(f(x)) = x', x_rec, x_test, tol=1e-10)

# (e) Scalability argument
print('\n(e) Why triangular Jacobian matters:')
print('  For d=1000: direct det(J) = O(d^3) = O(10^9) operations')
print('  Triangular: det = product of diagonals = O(d) operations')
print('  Training NF via MLE: needs log|det J| at every step => must be O(d)')

print('\nTakeaway: Coupling layers make log|det J| = sum(s(x1)) tractable — '
      'from O(d^3) to O(d). This is the key design choice in RealNVP and Glow.')

print("Exercise 8 solution complete.")

Exercise 9 (★★★): Hessian-Vector Products Without Forming the Hessian

For

$$f(\mathbf{x}) = \sum_i \log\bigl(1 + e^{\mathbf{a}_i^\top \mathbf{x}}\bigr),$$

compute $H\mathbf{v}$ by finite differences of gradients and compare with the exact Hessian-vector product.

Code cell 29

# Your Solution
# Exercise 9 - learner workspace
# Compare finite-difference HVP with an analytic HVP.
print("Learner workspace ready for Exercise 9.")

Code cell 30

# Solution
# Exercise 9 - Hessian-vector products
header("Exercise 9: Hessian-vector product")

A = np.array([[1.0, -0.5], [0.3, 0.8], [-1.2, 0.4]])
x = np.array([0.2, -0.7])
v = np.array([0.6, -0.1])

def grad_f(x):
    s = sigmoid(A @ x)
    return A.T @ s

def hvp_exact(x, v):
    s = sigmoid(A @ x)
    weights = s * (1 - s)
    return A.T @ (weights * (A @ v))

def hvp_fd(x, v, h=1e-5):
    return (grad_f(x + h*v) - grad_f(x - h*v)) / (2*h)

exact = hvp_exact(x, v)
fd = hvp_fd(x, v)
print("exact HVP:", exact)
print("finite-difference HVP:", fd)
check_close("HVP match", exact, fd, tol=1e-6)
print("Takeaway: second-order optimizers often need HVPs, not explicit Hessian matrices.")

Exercise 10 (★★★): Gauss-Newton Approximation for Least Squares

For residuals $r_i(\theta)$ and loss $L(\theta) = \frac{1}{2}\|r(\theta)\|_2^2$, the Gauss-Newton matrix is $G = J^\top J$, where $J = \partial r/\partial\theta$.

Compute $J$, $G$, and compare a Gauss-Newton step with a gradient descent step.

Code cell 32

# Your Solution
# Exercise 10 - learner workspace
# Build residuals, Jacobian, and compare update directions.
print("Learner workspace ready for Exercise 10.")

Code cell 33

# Solution
# Exercise 10 - Gauss-Newton step
header("Exercise 10: Gauss-Newton least squares")

xs = np.array([-1.0, 0.0, 1.0, 2.0])
y_obs = np.array([0.2, 1.0, 2.7, 5.1])
theta = np.array([1.0, 0.5])  # model theta0 + theta1*x^2

def residual(theta):
    return theta[0] + theta[1] * xs**2 - y_obs

def jac(theta):
    return np.column_stack([np.ones_like(xs), xs**2])

r = residual(theta)
J = jac(theta)
grad = J.T @ r
G = J.T @ J
step_gn = -la.solve(G + 1e-8*np.eye(2), grad)
step_gd = -0.1 * grad
print("gradient:", grad)
print("Gauss-Newton step:", step_gn)
print("gradient descent step:", step_gd)
check_true("Gauss-Newton decreases loss", 0.5*la.norm(residual(theta+step_gn))**2 < 0.5*la.norm(r)**2)
print("Takeaway: Gauss-Newton uses Jacobian geometry to scale residual-fitting updates.")

What to Review After Finishing

  • Exercise 1 — Jacobian rows = output gradients, columns = input sensitivities
  • Exercise 2 — Softmax Jacobian is $\text{diag}(\mathbf{p}) - \mathbf{p}\mathbf{p}^\top$, rank $K-1$
  • Exercise 3 — Hessian eigenvalues classify critical points geometrically
  • Exercise 4 — LayerNorm removes 2 degrees of freedom, rank $d-2$
  • Exercise 5 — HVPs enable $\lambda_{\max}$ estimation without forming $H$
  • Exercise 6 — Newton converges quadratically vs linearly for GD
  • Exercise 7 — K-FAC: Kronecker structure reduces inversion from $O(n^6)$ to $O(n^3)$
  • Exercise 8 — Triangular Jacobians make $\log|\det J|$ tractable in $O(d)$
  • Exercise 9 — HVPs can be checked by finite differences of gradients, without forming the Hessian
  • Exercise 10 — Gauss-Newton approximates the Hessian by $J^\top J$ for least-squares problems

References

  1. Pearlmutter (1994) — Fast Hessian-vector products
  2. Martens & Grosse (2015) — K-FAC
  3. Foret et al. (2021) — SAM
  4. Dinh et al. (2017) — RealNVP (coupling layers)
  5. Cohen et al. (2022) — Edge of stability