Math for LLMs: Multivariate Calculus / Jacobians and Hessians

Exercises Notebook

Converted from exercises.ipynb for web reading.

Jacobians and Hessians - Exercises

10 graded exercises covering the full section arc, from multivariate calculus mechanics to ML-facing gradient systems.

Each exercise has three cells:

  • Problem: markdown cell with the task description
  • Your Solution: code cell for learner work
  • Solution: reference solution with checks

Difficulty: straightforward -> moderate -> challenging.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

import numpy as np
import numpy.linalg as la
from scipy import optimize, special, stats
from scipy.optimize import minimize, fsolve, linprog
from math import factorial
import math
import matplotlib.patches as patches

COLORS = {
    "primary": "#0077BB",
    "secondary": "#EE7733",
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",
    "highlight": "#EE3377",
}
HAS_MPL = True
np.set_printoptions(precision=8, suppress=True)
np.random.seed(42)
spmin = minimize

try:
    import torch
    HAS_TORCH = True
except ImportError:
    torch = None
    HAS_TORCH = False


def header(title):
    print("\n" + "=" * len(title))
    print(title)
    print("=" * len(title))

def check_true(name, cond):
    ok = bool(cond)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    return ok

def check_close(name, got, expected, tol=1e-8):
    ok = np.allclose(got, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}: got {got}, expected {expected}")
    return ok

def check(name, got, expected, tol=1e-8):
    return check_close(name, got, expected, tol=tol)

def sigmoid(x):
    x = np.asarray(x, dtype=float)
    return np.where(x >= 0, 1/(1+np.exp(-x)), np.exp(x)/(1+np.exp(x)))

def softmax(z, axis=-1):
    z = np.asarray(z, dtype=float)
    z = z - np.max(z, axis=axis, keepdims=True)
    e = np.exp(z)
    return e / np.sum(e, axis=axis, keepdims=True)

def relu(x):
    return np.maximum(0, x)

def relu_prime(x):
    return np.where(np.asarray(x) > 0, 1.0, 0.0)

def centered_diff(f, x, h=1e-6):
    return (f(x + h) - f(x - h)) / (2 * h)

def numerical_gradient(f, x, h=1e-6):
    x = np.asarray(x, dtype=float)
    grad = np.zeros_like(x, dtype=float)
    for idx in np.ndindex(x.shape):
        xp = x.copy(); xm = x.copy()
        xp[idx] += h; xm[idx] -= h
        grad[idx] = (f(xp) - f(xm)) / (2*h)
    return grad

def numerical_jacobian(f, x, h=1e-6):
    x = np.asarray(x, dtype=float)
    y0 = np.asarray(f(x), dtype=float)
    J = np.zeros((y0.size, x.size), dtype=float)
    flat = x.reshape(-1)
    for j in range(x.size):
        xp = flat.copy(); xm = flat.copy()
        xp[j] += h; xm[j] -= h
        J[:, j] = ((np.asarray(f(xp.reshape(x.shape))) - np.asarray(f(xm.reshape(x.shape)))) / (2*h)).reshape(-1)
    return J.reshape(y0.shape + x.shape)

def grad_check(f, x, analytic_grad, h=1e-6):
    numeric_grad = numerical_gradient(f, x, h=h)
    denom = la.norm(analytic_grad) + la.norm(numeric_grad) + 1e-12
    return la.norm(analytic_grad - numeric_grad) / denom



def jacobian_fd(f, x, h=1e-6):
    x = np.asarray(x, dtype=float)
    y0 = np.asarray(f(x), dtype=float)
    J = np.zeros((y0.size, x.size), dtype=float)
    flat = x.reshape(-1)
    for j in range(x.size):
        xp = flat.copy(); xm = flat.copy()
        xp[j] += h; xm[j] -= h
        yp = np.asarray(f(xp.reshape(x.shape)), dtype=float).reshape(-1)
        ym = np.asarray(f(xm.reshape(x.shape)), dtype=float).reshape(-1)
        J[:, j] = (yp - ym) / (2*h)
    return J.reshape(y0.shape + x.shape)

def hessian_fd(f, x, h=1e-5):
    x = np.asarray(x, dtype=float)
    H = np.zeros((x.size, x.size), dtype=float)
    flat = x.reshape(-1)
    for j in range(x.size):
        xp = flat.copy(); xm = flat.copy()
        xp[j] += h; xm[j] -= h
        gp = numerical_gradient(lambda z: f(z.reshape(x.shape)), xp.reshape(x.shape), h=h).reshape(-1)
        gm = numerical_gradient(lambda z: f(z.reshape(x.shape)), xm.reshape(x.shape), h=h).reshape(-1)
        H[:, j] = (gp - gm) / (2*h)
    return H.reshape(x.shape + x.shape)



def grad_fd(f, x, h=1e-6):
    return numerical_gradient(f, x, h=h)



def fd_grad(f, x, h=1e-6):
    return numerical_gradient(f, np.asarray(x, dtype=float), h=h)

print("Chapter helper setup complete.")

Exercise 1 (★): Jacobian Computation and JVP/VJP

Let $f: \mathbb{R}^3 \to \mathbb{R}^2$ be $f(x_1, x_2, x_3) = (x_1^2 + x_2 x_3,\; e^{x_1} - x_3^2)$.

(a) Compute $J_f(\mathbf{x})$ analytically for general $\mathbf{x}$.

(b) Evaluate $J_f$ at $\mathbf{x}_0 = (0, 1, 1)^\top$ and verify against finite differences.

(c) Compute the JVP $J_f \mathbf{v}$ at $\mathbf{x}_0$ for $\mathbf{v} = (1, 0, -1)^\top$.

(d) Compute the VJP $J_f^\top \mathbf{u}$ at $\mathbf{x}_0$ for $\mathbf{u} = (1, 2)^\top$.

(e) Verify the duality: $\mathbf{u}^\top(J_f\mathbf{v}) = (J_f^\top\mathbf{u})^\top\mathbf{v}$.

Code cell 5

# Your Solution
# Exercise 1 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 1.")

Code cell 6

# Solution
# Exercise 1 - reference solution

def f_ex1(x):
    return np.array([x[0]**2 + x[1]*x[2],
                     np.exp(x[0]) - x[2]**2])

def jacobian_ex1(x):
    """Analytical Jacobian of f_ex1."""
    return np.array([[2*x[0],   x[2],    x[1]],
                     [np.exp(x[0]), 0.0, -2*x[2]]])

x0 = np.array([0.0, 1.0, 1.0])
v  = np.array([1.0, 0.0, -1.0])
u  = np.array([1.0, 2.0])

J = jacobian_ex1(x0)
J_fd = jacobian_fd(f_ex1, x0)

header('Exercise 1: Jacobian Computation')
print('Jacobian (analytical):')
print(J)

check_close('Jacobian matches FD', J, J_fd, tol=1e-7)

# (c) JVP
jvp = J @ v
print(f'\nJVP J@v = {jvp}')
# Row 1: [0, 1, 1] @ (1, 0, -1) = -1;  Row 2: [1, 0, -2] @ (1, 0, -1) = 3
jvp_expected = np.array([-1.0, 3.0])
check_close('JVP correct', jvp, jvp_expected)

# (d) VJP
vjp = J.T @ u
print(f'VJP J^T@u = {vjp}')
# = [2*0*1 + exp(0)*2, 1*1 + 0*2, 1*1 + (-2)*2] = [2, 1, -3]
vjp_expected = np.array([2.0, 1.0, -3.0])
check_close('VJP correct', vjp, vjp_expected)

# (e) Duality
lhs = u @ (J @ v)   # u^T (Jv)
rhs = vjp @ v       # (J^T u)^T v
check_close('Duality u^T(Jv) = (J^Tu)^Tv', lhs, rhs)

print('\nTakeaway: JVP and VJP are transposes — backprop is repeated VJP, costing O(1) passes for scalar loss.')

print("Exercise 1 solution complete.")

Exercise 2 (★): Softmax Jacobian Properties

Let $\sigma: \mathbb{R}^K \to \mathbb{R}^K$ be softmax, $\mathbf{p} = \sigma(\mathbf{z})$.

(a) For $\mathbf{z} = (1, 2, 0)^\top$, compute $\mathbf{p}$ and $J_\sigma = \text{diag}(\mathbf{p}) - \mathbf{p}\mathbf{p}^\top$.

(b) Verify $J_\sigma \mathbf{1} = \mathbf{0}$ and that $J_\sigma$ is symmetric.

(c) Compute the eigenvalues of $J_\sigma$ and confirm rank $= K-1$.

(d) Derive the cross-entropy gradient: for $\mathcal{L} = -\log p_y$, show $\partial\mathcal{L}/\partial\mathbf{z} = \mathbf{p} - \mathbf{e}_y$.

(e) Verify part (d) numerically for $y = 1$ (second class).

Code cell 8

# Your Solution
# Exercise 2 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 2.")

Code cell 9

# Solution
# Exercise 2 - reference solution

z = np.array([1.0, 2.0, 0.0])
K = len(z)

def softmax_jacobian(z):
    p = softmax(z)
    return np.diag(p) - np.outer(p, p)

p = softmax(z)
J = softmax_jacobian(z)

header('Exercise 2: Softmax Jacobian')
print(f'p = {p.round(6)}')
print('J_sigma:')
print(J.round(6))

# (b)
ones = np.ones(K)
check_close('J @ 1 = 0 (null space)', J @ ones, np.zeros(K), tol=1e-10)
check_true('J is symmetric', np.allclose(J, J.T))

# (c)
eigvals = np.sort(la.eigvalsh(J))[::-1]
print(f'Eigenvalues: {eigvals.round(6)}')
rank_J = la.matrix_rank(J)
check_true(f'Rank = K-1 = {K-1}', rank_J == K-1)
check_true('All eigenvalues >= 0 (PSD)', np.all(eigvals >= -1e-10))

# (d)+(e): Cross-entropy gradient
y_class = 1  # index of true class
e_y = np.zeros(K); e_y[y_class] = 1.0

# Analytical: grad_z L = p - e_y
ce_grad_ana = p - e_y

# Numerical: L = -log(p_y) = -log(softmax(z)[y])
def ce_loss(z, y=y_class):
    return -np.log(softmax(z)[y])

ce_grad_fd = jacobian_fd(lambda z: np.array([ce_loss(z)]), z)[0]

print(f'\nCE gradient (analytical p-e_y): {ce_grad_ana.round(6)}')
print(f'CE gradient (FD):               {ce_grad_fd.round(6)}')
check_close('CE gradient = p - e_y', ce_grad_ana, ce_grad_fd, tol=1e-6)

print('\nTakeaway: The clean gradient p-e_y comes from the softmax Jacobian structure.')

print("Exercise 2 solution complete.")

Exercise 3 (★): Hessian and Critical Point Classification

Let $f(x, y) = x^3 + 3xy - y^3$.

(a) Solve $\nabla f = 0$ to find all critical points.

(b) Compute $H_f$ at each critical point.

(c) Classify each point (local min, max, or saddle) using eigenvalues.

(d) For the saddle point, plot the contour and the two eigenvector directions showing curvature up and curvature down.

Code cell 11

# Your Solution
# Exercise 3 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 3.")

Code cell 12

# Solution
# Exercise 3 - reference solution

def f_ex3(x):
    return x[0]**3 + 3*x[0]*x[1] - x[1]**3

# Critical points:
# df/dx = 3x^2 + 3y = 0  =>  y = -x^2
# df/dy = 3x - 3y^2 = 0  =>  x = y^2 = x^4  =>  x(x^3-1)=0
# => x=0, y=0  OR  x=1, y=-1
critical_points = [np.array([0., 0.]), np.array([1., -1.])]

header('Exercise 3: Critical Point Classification')
for cp in critical_points:
    H = hessian_fd(f_ex3, cp)
    eigvals = np.sort(la.eigvalsh(H))
    grad_at_cp = np.array([3*cp[0]**2 + 3*cp[1], 3*cp[0] - 3*cp[1]**2])

    if np.all(eigvals > 0):
        ctype = 'LOCAL MINIMUM'
    elif np.all(eigvals < 0):
        ctype = 'LOCAL MAXIMUM'
    else:
        ctype = 'SADDLE POINT'

    print(f'\nPoint {cp}:')
    print(f'  grad = {grad_at_cp.round(8)}')
    print(f'  eigenvalues = {eigvals.round(6)}')
    print(f'  Type: {ctype}')

    check_true(f'gradient ~ 0 at {cp}', la.norm(grad_at_cp) < 1e-8)

# (d) Visualize the saddle, which is at (0, 0); (1, -1) is the local minimum
if HAS_MPL:
    cp_sad = np.array([0., 0.])
    H_sad = hessian_fd(f_ex3, cp_sad)
    evals, evecs = la.eigh(H_sad)

    xx, yy = np.meshgrid(np.linspace(-0.8, 0.8, 60), np.linspace(-0.8, 0.8, 60))
    zz = xx**3 + 3*xx*yy - yy**3

    fig, ax = plt.subplots(figsize=(8, 7))
    cf = ax.contourf(xx, yy, zz, levels=25, cmap='RdBu_r', alpha=0.8)
    plt.colorbar(cf, ax=ax)
    ax.contour(xx, yy, zz, levels=25, colors='white', linewidths=0.4, alpha=0.4)
    ax.plot(*cp_sad, 'ko', ms=12, label='Saddle (0,0)')

    scale = 0.35
    for ev, vec in zip(evals, evecs.T):
        c = 'lime' if ev > 0 else 'red'
        tip = cp_sad + scale*vec
        ax.annotate('', xy=tip, xytext=cp_sad,
                    arrowprops=dict(arrowstyle='->', color=c, lw=2.5))
        ax.text(*(tip + 0.05*vec), f'λ={ev:.2f}', color=c, fontsize=11)

    ax.set_xlabel('x'); ax.set_ylabel('y')
    ax.set_title('f(x,y) = x³+3xy-y³: Saddle at (0,0)')
    plt.tight_layout(); plt.show()
    print('Figure: green=curvature up (min direction), red=curvature down (max direction)')

print('\nTakeaway: Hessian eigenvalues classify critical points geometrically.')

print("Exercise 3 solution complete.")

Exercise 4 (★★): LayerNorm Jacobian

LayerNorm: $\text{LN}(\mathbf{x}) = (\mathbf{x} - \bar{x}\mathbf{1})/\sigma$, with $\bar{x} = \tfrac{1}{d}\sum_i x_i$, $\sigma^2 = \tfrac{1}{d}\sum_i (x_i - \bar{x})^2$, and $\hat{\mathbf{x}} = (\mathbf{x} - \bar{x}\mathbf{1})/\sigma$.

(a) Derive and implement $J_{\text{LN}} = \frac{1}{\sigma}\left(I - \frac{1}{d}\mathbf{1}\mathbf{1}^\top - \frac{1}{d}\hat{\mathbf{x}}\hat{\mathbf{x}}^\top\right)$.

(b) Verify against finite differences for $\mathbf{x} = (1, 2, 3, 4)^\top$.

(c) Show $\mathbf{1}$ and $\hat{\mathbf{x}}$ are in the null space.

(d) Compute the rank of $J_{\text{LN}}$ and explain what it means geometrically.

Code cell 14

# Your Solution
# Exercise 4 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 4.")

Code cell 15

# Solution
# Exercise 4 - reference solution

def layernorm(x):
    mu = x.mean()
    sigma = np.sqrt(((x-mu)**2).mean())
    return (x - mu) / sigma

def layernorm_jacobian(x):
    d = len(x)
    mu = x.mean()
    sigma = np.sqrt(((x-mu)**2).mean())
    x_hat = (x - mu) / sigma
    I = np.eye(d)
    return (1/sigma) * (I - np.ones((d,d))/d - np.outer(x_hat, x_hat)/d)

x = np.array([1.0, 2.0, 3.0, 4.0])
d = len(x)

J_ln = layernorm_jacobian(x)
J_ln_fd = jacobian_fd(layernorm, x)

header('Exercise 4: LayerNorm Jacobian')
print('J_LN (analytical):')
print(J_ln.round(6))

check_close('J_LN matches FD', J_ln, J_ln_fd, tol=1e-6)

# (c) Null space
ones_vec = np.ones(d)
mu = x.mean(); sigma = np.sqrt(((x-mu)**2).mean())
x_hat = (x - mu) / sigma

check_close('J_LN @ 1 = 0', J_ln @ ones_vec, np.zeros(d), tol=1e-10)
check_close('J_LN @ x_hat = 0', J_ln @ x_hat, np.zeros(d), tol=1e-10)

# (d) Rank
rank_ln = la.matrix_rank(J_ln)
check_true(f'rank = d-2 = {d-2}', rank_ln == d-2)

print(f'\nRank = {rank_ln}: LayerNorm cannot distinguish inputs that differ')
print('by adding a constant (direction 1) or scaling (direction x_hat).')
print('\nTakeaway: LayerNorm removes mean and variance information — its Jacobian has rank d-2.')

print("Exercise 4 solution complete.")

Exercise 5 (★★): HVPs and Edge of Stability

MSE loss: $f(\mathbf{w}) = \frac{1}{2m}\|X\mathbf{w} - \mathbf{y}\|^2$, Hessian $H = \frac{1}{m}X^\top X$.

(a) Implement the HVP $H\mathbf{v} = \frac{1}{m}X^\top(X\mathbf{v})$ (matrix-free).

(b) Use power iteration (20 steps) to estimate $\lambda_{\max}(H)$.

(c) Compute the max stable learning rate $\eta_{\max} = 2/\lambda_{\max}$.

(d) Run GD with $\eta = 0.9\,\eta_{\max}$ (converges) and $\eta = 1.1\,\eta_{\max}$ (diverges).

(e) Plot both loss curves and explain the edge of stability.

Code cell 17

# Your Solution
# Exercise 5 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 5.")

Code cell 18

# Solution
# Exercise 5 - reference solution

np.random.seed(42)
m_data, n_feat = 40, 8
X = np.random.randn(m_data, n_feat)
true_w = np.random.randn(n_feat)
y = X @ true_w + 0.05*np.random.randn(m_data)

def mse_loss(w):
    return 0.5 * np.mean((X @ w - y)**2)

def mse_grad(w):
    return X.T @ (X @ w - y) / m_data

def hvp_mse(v):
    """HVP for MSE: H@v = X^T(Xv)/m."""
    return X.T @ (X @ v) / m_data

def power_iter(hvp_fn, n, n_iters=20, seed=1):
    np.random.seed(seed)
    v = np.random.randn(n)
    v = v / la.norm(v)
    lam = 0.0
    for _ in range(n_iters):
        hv = hvp_fn(v)
        lam = v @ hv
        v = hv / la.norm(hv)
    return lam

lam_max = power_iter(hvp_mse, n_feat)
lam_max_true = la.eigvalsh(X.T @ X / m_data)[-1]
eta_max = 2.0 / lam_max

header('Exercise 5: HVPs and Edge of Stability')
print(f'lambda_max (power iter): {lam_max:.6f}')
print(f'lambda_max (exact):      {lam_max_true:.6f}')
check_close('Power iter matches exact', lam_max, lam_max_true, tol=0.01)

print(f'Max stable lr: eta_max = {eta_max:.6f}')

# (d) GD stable vs unstable
n_steps = 200
losses_stable, losses_unstable = [], []
w_stable = w_unstable = np.zeros(n_feat)

for _ in range(n_steps):
    w_stable   = w_stable   - 0.9*eta_max * mse_grad(w_stable)
    w_unstable = w_unstable - 1.1*eta_max * mse_grad(w_unstable)
    losses_stable.append(mse_loss(w_stable))
    losses_unstable.append(min(mse_loss(w_unstable), 1e8))

check_true('Stable GD converges', losses_stable[-1] < 0.01)
check_true('Unstable GD diverges', losses_unstable[-1] > 1e4)

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.semilogy(losses_stable, 'g-', lw=2,
                label=f'η=0.9×η_max (converges)')
    ax.semilogy(losses_unstable, 'r-', lw=2,
                label=f'η=1.1×η_max (diverges)')
    ax.set_xlabel('Steps'); ax.set_ylabel('Loss (log)')
    ax.set_title(f'Edge of Stability: η_max = {eta_max:.4f}')
    ax.legend(); plt.tight_layout(); plt.show()

print('\nTakeaway: lambda_max determines the maximum stable learning rate. '
      'Power iteration (20 HVPs) estimates it without forming H.')

print("Exercise 5 solution complete.")

Exercise 6 (★★): Newton's Method Convergence

Minimise $g(x) = \log(1+e^x) - \tfrac{3}{4}x$ (minimum at $x^* = \log 3$, where $\sigma(x^*) = \tfrac{3}{4}$).

(a) Compute $g'(x) = \sigma(x) - \tfrac{3}{4}$ and $g''(x) = \sigma(x)(1-\sigma(x))$.

(b) Run 10 steps of Newton's method from $x_0 = 2$.

(c) Run 100 steps of gradient descent with $\eta = 0.5$ from $x_0 = 2$.

(d) Plot both error sequences on a log scale and verify Newton's quadratic convergence.

(e) Compute the Newton decrement $\lambda_N = \sqrt{(g'(x))^2/g''(x)}$ at each step.

Code cell 20

# Your Solution
# Exercise 6 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 6.")

Code cell 21

# Solution
# Exercise 6 - reference solution

# Uses the numerically stable sigmoid() from the chapter setup cell.
def g(x): return np.log(1 + np.exp(x)) - 0.75*x
def g_prime(x): return sigmoid(x) - 0.75          # zero where sigma(x) = 3/4, i.e. x = log 3
def g_double_prime(x): s = sigmoid(x); return s*(1-s)

x_star = np.log(3.0)
x0 = 2.0

# Newton's method
x = x0
errors_newton = [abs(x - x_star)]
decrements_newton = []
for _ in range(10):
    decrement = np.sqrt(g_prime(x)**2 / g_double_prime(x))
    decrements_newton.append(decrement)
    x = x - g_prime(x) / g_double_prime(x)  # Newton step
    errors_newton.append(abs(x - x_star))

# Gradient descent
x_gd = x0
errors_gd = [abs(x_gd - x_star)]
eta_gd = 0.5
for _ in range(100):
    x_gd = x_gd - eta_gd * g_prime(x_gd)
    errors_gd.append(abs(x_gd - x_star))

header('Exercise 6: Newton vs GD Convergence')
print('Newton errors (with Newton decrement, part (e)):')
for i, e in enumerate(errors_newton[:8]):
    dec = f'   decrement {decrements_newton[i]:.3e}' if i < len(decrements_newton) else ''
    print(f'  step {i}: error {e:.3e}{dec}')

# Quadratic convergence check: log|e_{t+1}| ≈ 2 * log|e_t| near optimum
valid = [(errors_newton[i], errors_newton[i+1])
         for i in range(len(errors_newton)-1)
         if errors_newton[i] > 1e-12 and errors_newton[i+1] > 1e-12]

if len(valid) >= 2:
    e_t, e_t1 = valid[-2]
    ratio = np.log(e_t1+1e-20) / (np.log(e_t+1e-20) + 1e-20)
    print(f'\nLog-error ratio (should be ~2 for quadratic convergence): {ratio:.2f}')
    check_true('Quadratic convergence (ratio > 1.5)', ratio > 1.5)

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.semilogy(errors_newton, 'b-o', ms=6, lw=2, label='Newton (10 steps)')
    ax.semilogy(errors_gd[:20], 'r-', lw=2, label='GD η=0.5 (first 20 steps)')
    ax.set_xlabel('Step'); ax.set_ylabel('|x_t - x*|')
    ax.set_title('Newton (quadratic) vs GD (linear) Convergence')
    ax.legend(); plt.tight_layout(); plt.show()

print('\nTakeaway: Newton converges in ~5 steps (quadratic). GD needs hundreds (linear).')

print("Exercise 6 solution complete.")

Exercise 7 (★★★): K-FAC Kronecker Factorisation

Linear layer $\mathbf{y} = W\mathbf{x}$, $W \in \mathbb{R}^{n_{\text{out}}\times n_{\text{in}}}$.

(a) Show that $\text{vec}(\nabla_W\mathcal{L}) = \mathbf{x} \otimes \mathbf{g}$ where $\mathbf{g} = \partial\mathcal{L}/\partial\mathbf{y}$ (column-major vec).

(b) Compute the exact FIM block from 150 samples and the K-FAC approximation $A \otimes G$.

(c) Verify $(A\otimes G)^{-1} = A^{-1}\otimes G^{-1}$.

(d) Compute the K-FAC natural gradient step $\Delta W = -G^{-1}(\nabla_W\mathcal{L})A^{-1}$ and verify it matches $-(A^{-1}\otimes G^{-1})\,\text{vec}(\nabla_W\mathcal{L})$.

(e) Compare the computational cost: direct FIM inversion vs K-FAC inversion.

Code cell 23

# Your Solution
# Exercise 7 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 7.")

Code cell 24

# Solution
# Exercise 7 - reference solution

n_in_k, n_out_k = 5, 4
n_samp_k = 150
np.random.seed(42)

activations_k = np.random.randn(n_samp_k, n_in_k)
back_grads_k  = np.random.randn(n_samp_k, n_out_k) * 0.3

# (a) Verify vec(∇_W L) = x ⊗ g for one sample
a_i = activations_k[0]
g_i = back_grads_k[0]
grad_W_i = np.outer(g_i, a_i)  # (n_out, n_in)
kron_ag = np.kron(a_i, g_i)    # (n_in * n_out,)
vec_grad_W_i = grad_W_i.T.flatten()  # column-major vec

header('Exercise 7: K-FAC')
print('(a) Verify vec(∇_W) = x ⊗ g:')
check_close('kron(a,g) = vec(∇_W)', kron_ag, vec_grad_W_i)

# (b) Exact FIM block
dim_fim = n_in_k * n_out_k
fim_exact_k = np.zeros((dim_fim, dim_fim))
for i in range(n_samp_k):
    a = activations_k[i]; g = back_grads_k[i]
    v = np.kron(a, g)
    fim_exact_k += np.outer(v, v)
fim_exact_k /= n_samp_k

# K-FAC approximation
A_k = activations_k.T @ activations_k / n_samp_k
G_k = back_grads_k.T  @ back_grads_k  / n_samp_k
fim_kfac_k = np.kron(A_k, G_k)

rel_err_k = la.norm(fim_exact_k - fim_kfac_k) / la.norm(fim_exact_k)
print(f'\n(b) FIM shape: {fim_exact_k.shape}')
print(f'K-FAC relative error: {rel_err_k:.4f}')

# (c) (A⊗G)^{-1} = A^{-1} ⊗ G^{-1}
# A_k and G_k are well-conditioned here (150 samples), so no damping is needed.
# Note: damping the factors separately is NOT the same as damping A⊗G.
A_inv_k = la.inv(A_k)
G_inv_k = la.inv(G_k)
kfac_inv_k = np.kron(A_inv_k, G_inv_k)
direct_inv_k = la.inv(fim_kfac_k)

check_close('(c) (A⊗G)^-1 = A^-1⊗G^-1', kfac_inv_k, direct_inv_k, tol=1e-6)

# (d) K-FAC natural gradient step
np.random.seed(7)
grad_W_test = np.random.randn(n_out_k, n_in_k)

# Matrix form: ΔW = -G^{-1} ∇W A^{-1}
delta_W_matrix = -G_inv_k @ grad_W_test @ A_inv_k

# Vec form: -(A^-1 ⊗ G^-1) vec(∇W), column-major vec
vec_grad = grad_W_test.T.flatten()
delta_W_vec = -(kfac_inv_k @ vec_grad).reshape(n_in_k, n_out_k).T

check_close('(d) matrix form = vec form', delta_W_matrix, delta_W_vec, tol=1e-8)

# (e) Cost comparison
print(f'\n(e) Computational cost comparison:')
print(f'  Direct FIM inversion: O((n_in*n_out)^3) = O({(n_in_k*n_out_k)**3})')
print(f'  K-FAC inversion: O(n_in^3 + n_out^3) = O({n_in_k**3 + n_out_k**3})')
speedup = (n_in_k*n_out_k)**3 / (n_in_k**3 + n_out_k**3)
print(f'  Speedup factor: {speedup:.1f}x (grows roughly as min(n_in, n_out)^3)')

print('\nTakeaway: K-FAC uses Kronecker structure to reduce inversion from O(n^6) to O(n^3).')

print("Exercise 7 solution complete.")

Exercise 8 (★★★): Jacobian Determinant in Normalising Flows

RealNVP coupling layer: $\mathbf{y}_1 = \mathbf{x}_1$, $\mathbf{y}_2 = \mathbf{x}_2 \odot e^{s(\mathbf{x}_1)} + t(\mathbf{x}_1)$.

(a) Show the Jacobian is block lower-triangular.

(b) Show $\log|\det J_f| = \sum_i s_i(\mathbf{x}_1)$.

(c) Verify numerically for a specific $\mathbf{x}$.

(d) Implement the inverse and verify $f^{-1}(f(\mathbf{x})) = \mathbf{x}$.

(e) Explain why triangular Jacobians are essential for scalable generative modelling.

Code cell 26

# Your Solution
# Exercise 8 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 8.")

Code cell 27

# Solution
# Exercise 8 - reference solution

W_s = np.array([[0.5, -0.2], [0.3, 0.1]])
W_t = np.array([[0.1, 0.0], [0.0, -0.2]])

def s_net(x1): return W_s @ x1
def t_net(x1): return W_t @ x1

def coupling_fwd(x):
    x1, x2 = x[:2], x[2:]
    s, t = s_net(x1), t_net(x1)
    return np.concatenate([x1, x2 * np.exp(s) + t])

def coupling_inv(y):
    y1, y2 = y[:2], y[2:]
    x1 = y1.copy()
    s, t = s_net(x1), t_net(x1)
    x2 = (y2 - t) * np.exp(-s)
    return np.concatenate([x1, x2])

x_test = np.array([1.0, 0.5, -0.3, 2.0])

header('Exercise 8: Normalising Flow Jacobian')

# (b)+(c) Log-det verification
y_test = coupling_fwd(x_test)
J_coupling = jacobian_fd(coupling_fwd, x_test)

print('(a)+(b) Jacobian (should be lower-triangular):')
print(J_coupling.round(4))

upper_right = J_coupling[:2, 2:]
check_close('Upper-right block = 0 (triangular)', upper_right, np.zeros((2,2)), tol=1e-8)

log_det_fd = np.log(np.abs(la.det(J_coupling)))
s_at_x1 = s_net(x_test[:2])
log_det_formula = np.sum(s_at_x1)

print(f'\n(c) log|det J| (direct from J): {log_det_fd:.6f}')
print(f'    log|det J| (formula sum(s)):  {log_det_formula:.6f}')
check_close('log-det formula correct', log_det_fd, log_det_formula, tol=1e-6)

# (d) Invertibility
x_rec = coupling_inv(y_test)
check_close('(d) f^-1(f(x)) = x', x_rec, x_test, tol=1e-10)

# (e) Scalability argument
print('\n(e) Why triangular Jacobian matters:')
print('  For d=1000: direct det(J) = O(d^3) = O(10^9) operations')
print('  Triangular: det = product of diagonals = O(d) operations')
print('  Training NF via MLE: needs log|det J| at every step => must be O(d)')

print('\nTakeaway: Coupling layers make log|det J| = sum(s(x1)) tractable — '
      'from O(d^3) to O(d). This is the key design choice in RealNVP and Glow.')

print("Exercise 8 solution complete.")

Exercise 9 (★★★): Hessian-Vector Products Without Forming the Hessian

For

$$f(\mathbf{x}) = \sum_i \log\bigl(1 + e^{\mathbf{a}_i^\top \mathbf{x}}\bigr),$$

compute $H\mathbf{v}$ by finite differences of gradients and compare with the exact Hessian-vector product.

Code cell 29

# Your Solution
# Exercise 9 - learner workspace
# Compare finite-difference HVP with an analytic HVP.
print("Learner workspace ready for Exercise 9.")

Code cell 30

# Solution
# Exercise 9 - Hessian-vector products
header("Exercise 9: Hessian-vector product")

A = np.array([[1.0, -0.5], [0.3, 0.8], [-1.2, 0.4]])
x = np.array([0.2, -0.7])
v = np.array([0.6, -0.1])

def grad_f(x):
    s = sigmoid(A @ x)
    return A.T @ s

def hvp_exact(x, v):
    s = sigmoid(A @ x)
    weights = s * (1 - s)
    return A.T @ (weights * (A @ v))

def hvp_fd(x, v, h=1e-5):
    return (grad_f(x + h*v) - grad_f(x - h*v)) / (2*h)

exact = hvp_exact(x, v)
fd = hvp_fd(x, v)
print("exact HVP:", exact)
print("finite-difference HVP:", fd)
check_close("HVP match", exact, fd, tol=1e-6)
print("Takeaway: second-order optimizers often need HVPs, not explicit Hessian matrices.")

Exercise 10 (★★★): Gauss-Newton Approximation for Least Squares

For residuals $r_i(\theta)$ and loss $L(\theta) = \frac{1}{2}\|r(\theta)\|_2^2$, the Gauss-Newton matrix is $G = J^\top J$, where $J = \partial r/\partial\theta$.

Compute $J$, $G$, and compare a Gauss-Newton step with a gradient descent step.

Code cell 32

# Your Solution
# Exercise 10 - learner workspace
# Build residuals, Jacobian, and compare update directions.
print("Learner workspace ready for Exercise 10.")

Code cell 33

# Solution
# Exercise 10 - Gauss-Newton step
header("Exercise 10: Gauss-Newton least squares")

xs = np.array([-1.0, 0.0, 1.0, 2.0])
y_obs = np.array([0.2, 1.0, 2.7, 5.1])
theta = np.array([1.0, 0.5])  # model theta0 + theta1*x^2

def residual(theta):
    return theta[0] + theta[1] * xs**2 - y_obs

def jac(theta):
    return np.column_stack([np.ones_like(xs), xs**2])

r = residual(theta)
J = jac(theta)
grad = J.T @ r
G = J.T @ J
step_gn = -la.solve(G + 1e-8*np.eye(2), grad)
step_gd = -0.1 * grad
print("gradient:", grad)
print("Gauss-Newton step:", step_gn)
print("gradient descent step:", step_gd)
check_true("Gauss-Newton decreases loss", 0.5*la.norm(residual(theta+step_gn))**2 < 0.5*la.norm(r)**2)
print("Takeaway: Gauss-Newton uses Jacobian geometry to scale residual-fitting updates.")

What to Review After Finishing

  • Exercise 1 — Jacobian rows = output gradients, columns = input sensitivities
  • Exercise 2 — Softmax Jacobian is $\text{diag}(\mathbf{p}) - \mathbf{p}\mathbf{p}^\top$, rank $K-1$
  • Exercise 3 — Hessian eigenvalues classify critical points geometrically
  • Exercise 4 — LayerNorm removes 2 degrees of freedom, rank $d-2$
  • Exercise 5 — HVPs enable $\lambda_{\max}$ estimation without forming $H$
  • Exercise 6 — Newton converges quadratically vs linearly for GD
  • Exercise 7 — K-FAC: Kronecker structure reduces inversion from $O(n^6)$ to $O(n^3)$
  • Exercise 8 — Triangular Jacobians make $\log|\det J|$ tractable in $O(d)$
  • Exercise 9 — HVPs can be checked by finite differences of gradients, without forming the Hessian
  • Exercise 10 — Gauss-Newton approximates the Hessian by $J^\top J$ for least-squares problems

References

  1. Pearlmutter (1994) — Fast Hessian-vector products
  2. Martens & Grosse (2015) — K-FAC
  3. Foret et al. (2021) — SAM
  4. Dinh et al. (2017) — RealNVP (coupling layers)
  5. Cohen et al. (2022) — Edge of stability