Exercises Notebook
Converted from `exercises.ipynb` for web reading.
Jacobians and Hessians - Exercises
10 graded exercises covering the full section arc, from multivariate calculus mechanics to ML-facing gradient systems.
| Format | Description |
|---|---|
| Problem | Markdown cell with task description |
| Your Solution | Code cell for learner work |
| Solution | Reference solution with checks |
Difficulty: straightforward -> moderate -> challenging.
Code cell 2
# Global plotting setup: prefer seaborn's theme when it is installed,
# otherwise fall back to matplotlib's bundled seaborn-like style sheet.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    # seaborn is optional; this style sheet approximates its look
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False
# Shared figure defaults applied to every plot in this notebook.
mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})
np.random.seed(42)  # reproducible random draws across notebook runs
print("Plot setup complete.")
Code cell 3
# Core numerical stack and shared configuration for all exercises below.
import numpy as np
import numpy.linalg as la
from scipy import optimize, special, stats
from scipy.optimize import minimize, fsolve, linprog
from math import factorial
import math
import matplotlib.patches as patches
# Colour-blind-friendly palette used across the exercise figures.
COLORS = {
    "primary": "#0077BB",
    "secondary": "#EE7733",
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",
    "highlight": "#EE3377",
}
HAS_MPL = True  # matplotlib imported in the plot-setup cell; solution cells guard plots on this
np.set_printoptions(precision=8, suppress=True)
np.random.seed(42)  # reproducibility for all random data below
spmin = minimize  # short alias for scipy.optimize.minimize
try:
    import torch
    HAS_TORCH = True
except ImportError:
    # torch is optional; exercises fall back to NumPy-only checks
    torch = None
    HAS_TORCH = False
def header(title):
    """Print *title* framed above and below by '=' rules of matching width."""
    rule = "=" * len(title)
    print("\n" + rule)
    print(title)
    print(rule)
def check_true(name, cond):
    """Print PASS/FAIL for a boolean condition and return it as a bool."""
    ok = bool(cond)
    status = "PASS" if ok else "FAIL"
    print(f"{status} - {name}")
    return ok

def check_close(name, got, expected, tol=1e-8):
    """Print PASS/FAIL for a numeric comparison via np.allclose; return the bool."""
    ok = np.allclose(got, expected, atol=tol, rtol=tol)
    status = "PASS" if ok else "FAIL"
    print(f"{status} - {name}: got {got}, expected {expected}")
    return ok

def check(name, got, expected, tol=1e-8):
    """Alias for check_close, kept for backward compatibility."""
    return check_close(name, got, expected, tol=tol)
def sigmoid(x):
    """Numerically stable logistic function, applied elementwise."""
    x = np.asarray(x, dtype=float)
    pos_branch = 1 / (1 + np.exp(-x))       # safe for x >= 0
    neg_branch = np.exp(x) / (1 + np.exp(x))  # safe for x < 0
    return np.where(x >= 0, pos_branch, neg_branch)
def softmax(z, axis=-1):
    """Softmax along *axis*, shifted by the max for numerical stability."""
    z = np.asarray(z, dtype=float)
    shifted = z - np.max(z, axis=axis, keepdims=True)
    expz = np.exp(shifted)
    return expz / np.sum(expz, axis=axis, keepdims=True)
def relu(x):
    """Elementwise rectifier: max(x, 0)."""
    return np.maximum(np.asarray(x), 0)

def relu_prime(x):
    """Subgradient of relu: 1.0 where x > 0, else 0.0 (0.0 at x == 0)."""
    return (np.asarray(x) > 0).astype(float)
def centered_diff(f, x, h=1e-6):
    """Central-difference estimate of f'(x) with step size h."""
    forward = f(x + h)
    backward = f(x - h)
    return (forward - backward) / (2 * h)
def numerical_gradient(f, x, h=1e-6):
    """Central-difference gradient of a scalar function f at x (any array shape)."""
    x = np.asarray(x, dtype=float)
    grad = np.zeros_like(x, dtype=float)
    for idx in np.ndindex(x.shape):
        bumped_up = x.copy()
        bumped_dn = x.copy()
        bumped_up[idx] += h
        bumped_dn[idx] -= h
        grad[idx] = (f(bumped_up) - f(bumped_dn)) / (2 * h)
    return grad
def numerical_jacobian(f, x, h=1e-6):
    """Central-difference Jacobian of a vector-valued f at x.

    Returns an array of shape f(x).shape + x.shape.
    """
    x = np.asarray(x, dtype=float)
    y0 = np.asarray(f(x), dtype=float)
    flat = x.reshape(-1)
    J = np.zeros((y0.size, flat.size), dtype=float)
    for col in range(flat.size):
        plus = flat.copy()
        minus = flat.copy()
        plus[col] += h
        minus[col] -= h
        f_plus = np.asarray(f(plus.reshape(x.shape)), dtype=float)
        f_minus = np.asarray(f(minus.reshape(x.shape)), dtype=float)
        J[:, col] = ((f_plus - f_minus) / (2 * h)).reshape(-1)
    return J.reshape(y0.shape + x.shape)
def grad_check(f, x, analytic_grad, h=1e-6):
    """Relative error between an analytic gradient and a finite-difference one."""
    numeric = numerical_gradient(f, x, h=h)
    scale = la.norm(analytic_grad) + la.norm(numeric) + 1e-12
    return la.norm(analytic_grad - numeric) / scale
def jacobian_fd(f, x, h=1e-6):
    """Finite-difference Jacobian of f at x, with shape f(x).shape + x.shape."""
    x = np.asarray(x, dtype=float)
    base = np.asarray(f(x), dtype=float)
    flat = x.reshape(-1)
    J = np.zeros((base.size, flat.size), dtype=float)
    for col in range(flat.size):
        plus = flat.copy()
        minus = flat.copy()
        plus[col] += h
        minus[col] -= h
        f_plus = np.asarray(f(plus.reshape(x.shape)), dtype=float).reshape(-1)
        f_minus = np.asarray(f(minus.reshape(x.shape)), dtype=float).reshape(-1)
        J[:, col] = (f_plus - f_minus) / (2 * h)
    return J.reshape(base.shape + x.shape)
def hessian_fd(f, x, h=1e-5):
    """Finite-difference Hessian of a scalar f at x, shape x.shape + x.shape.

    Built column by column as central differences of the numerical gradient.
    """
    x = np.asarray(x, dtype=float)
    flat = x.reshape(-1)
    H = np.zeros((flat.size, flat.size), dtype=float)
    scalar_f = lambda z: f(z.reshape(x.shape))
    for col in range(flat.size):
        plus = flat.copy()
        minus = flat.copy()
        plus[col] += h
        minus[col] -= h
        g_plus = numerical_gradient(scalar_f, plus.reshape(x.shape), h=h).reshape(-1)
        g_minus = numerical_gradient(scalar_f, minus.reshape(x.shape), h=h).reshape(-1)
        H[:, col] = (g_plus - g_minus) / (2 * h)
    return H.reshape(x.shape + x.shape)
def grad_fd(f, x, h=1e-6):
    """Alias for numerical_gradient."""
    return numerical_gradient(f, x, h=h)

def fd_grad(f, x, h=1e-6):
    """Alias for numerical_gradient that coerces x to a float array first."""
    return numerical_gradient(f, np.asarray(x, dtype=float), h=h)

print("Chapter helper setup complete.")
Exercise 1 (★): Jacobian Computation and JVP/VJP
Let $f : \mathbb{R}^3 \to \mathbb{R}^2$ be $f(x) = \big(x_1^2 + x_2 x_3,\; e^{x_1} - x_3^2\big)$.
(a) Compute the Jacobian $J_f(x)$ analytically for general $x$.
(b) Evaluate $J_f$ at $x_0 = (0, 1, 1)$ and verify against finite differences.
(c) Compute the JVP $J_f(x_0)\,v$ at $x_0$ for $v = (1, 0, -1)$.
(d) Compute the VJP $J_f(x_0)^\top u$ at $x_0$ for $u = (1, 2)$.
(e) Verify the duality: $u^\top (J v) = (J^\top u)^\top v$.
Code cell 5
# Your Solution
# Exercise 1 - learner workspace
# Write your solution here, then run the reference solution below to compare.
# The shared helpers (check_close, jacobian_fd, header, ...) from the setup cell are available.
print("Learner workspace ready for Exercise 1.")
Code cell 6
# Solution
# Exercise 1 - reference solution
def f_ex1(x):
    """f: R^3 -> R^2, f(x) = (x0^2 + x1*x2, exp(x0) - x2^2)."""
    first = x[0]**2 + x[1]*x[2]
    second = np.exp(x[0]) - x[2]**2
    return np.array([first, second])

def jacobian_ex1(x):
    """Analytical 2x3 Jacobian of f_ex1."""
    row_one = [2*x[0], x[2], x[1]]
    row_two = [np.exp(x[0]), 0.0, -2*x[2]]
    return np.array([row_one, row_two])
# Fix: the original assigned jvp_expected twice; the first (expression) form
# was dead code, immediately overwritten by the literal. Kept only one.
x0 = np.array([0.0, 1.0, 1.0])
v = np.array([1.0, 0.0, -1.0])
u = np.array([1.0, 2.0])
J = jacobian_ex1(x0)
J_fd = jacobian_fd(f_ex1, x0)
header('Exercise 1: Jacobian Computation')
print('Jacobian (analytical):')
print(J)
check_close('Jacobian matches FD', J, J_fd, tol=1e-7)
# (c) JVP: push the tangent vector v forward through the linearisation
jvp = J @ v
print(f'\nJVP J@v = {jvp}')
# Hand computation: row1 = 0*1 + 1*0 + 1*(-1) = -1; row2 = 1*1 + 0*0 + (-2)*(-1) = 3
jvp_expected = np.array([-1.0, 3.0])
check_close('JVP correct', jvp, jvp_expected)
# (d) VJP: pull the cotangent vector u back through the transpose
vjp = J.T @ u
print(f'VJP J^T@u = {vjp}')
# = [2*0*1 + exp(0)*2, 1*1 + 0*2, 1*1 + (-2)*2] = [2, 1, -3]
vjp_expected = np.array([2.0, 1.0, -3.0])
check_close('VJP correct', vjp, vjp_expected)
# (e) Duality: both sides are the bilinear form u^T J v
lhs = u @ (J @ v)  # u^T (Jv)
rhs = vjp @ v      # (J^T u)^T v
check_close('Duality u^T(Jv) = (J^Tu)^Tv', lhs, rhs)
print('\nTakeaway: JVP and VJP are transposes — backprop is repeated VJP, costing O(1) passes for scalar loss.')
print("Exercise 1 solution complete.")
Exercise 2 (★): Softmax Jacobian Properties
Let $\sigma : \mathbb{R}^K \to \mathbb{R}^K$ be softmax, $\sigma(z)_i = e^{z_i} / \sum_j e^{z_j}$.
(a) For $z = (1, 2, 0)$, compute $p = \sigma(z)$ and the Jacobian $J_\sigma = \operatorname{diag}(p) - p p^\top$.
(b) Verify $J_\sigma \mathbf{1} = 0$ and that $J_\sigma$ is symmetric.
(c) Compute the eigenvalues of $J_\sigma$ and confirm rank $K - 1$.
(d) Derive the cross-entropy gradient: for $L = -\log p_y$, show $\nabla_z L = p - e_y$.
(e) Verify part (d) numerically for $y = 2$ (second class).
Code cell 8
# Your Solution
# Exercise 2 - learner workspace
# Write your solution here, then run the reference solution below to compare.
# The shared helpers (softmax, check_close, jacobian_fd, ...) from the setup cell are available.
print("Learner workspace ready for Exercise 2.")
Code cell 9
# Solution
# Exercise 2 - reference solution
z = np.array([1.0, 2.0, 0.0])
K = len(z)

def softmax_jacobian(z):
    """Jacobian of softmax: diag(p) - p p^T."""
    p = softmax(z)
    return np.diag(p) - np.outer(p, p)

p = softmax(z)
J = softmax_jacobian(z)
header('Exercise 2: Softmax Jacobian')
print(f'p = {p.round(6)}')
print('J_sigma:')
print(J.round(6))
# (b) The all-ones vector lies in the null space and J is symmetric.
ones = np.ones(K)
check_close('J @ 1 = 0 (null space)', J @ ones, np.zeros(K), tol=1e-10)
check_true('J is symmetric', np.allclose(J, J.T))
# (c) Spectrum: PSD with exactly one zero eigenvalue, hence rank K-1.
eigvals = np.sort(la.eigvalsh(J))[::-1]
print(f'Eigenvalues: {eigvals.round(6)}')
rank_J = la.matrix_rank(J)
check_true(f'Rank = K-1 = {K-1}', rank_J == K-1)
check_true('All eigenvalues >= 0 (PSD)', np.all(eigvals >= -1e-10))
# (d)+(e): Cross-entropy gradient
y_class = 1  # index of true class
e_y = np.zeros(K)
e_y[y_class] = 1.0
# Analytical: grad_z L = p - e_y
ce_grad_ana = p - e_y

def ce_loss(z, y=y_class):
    """Cross-entropy loss L = -log(softmax(z)[y])."""
    return -np.log(softmax(z)[y])

ce_grad_fd = jacobian_fd(lambda z: np.array([ce_loss(z)]), z)[0]
print(f'\nCE gradient (analytical p-e_y): {ce_grad_ana.round(6)}')
print(f'CE gradient (FD): {ce_grad_fd.round(6)}')
check_close('CE gradient = p - e_y', ce_grad_ana, ce_grad_fd, tol=1e-6)
print('\nTakeaway: The clean gradient p-e_y comes from the softmax Jacobian structure.')
print("Exercise 2 solution complete.")
Exercise 3 (★): Hessian and Critical Point Classification
Let $f(x, y) = x^3 + 3xy - y^3$.
(a) Solve $\nabla f(x, y) = 0$ to find all critical points.
(b) Compute the Hessian $H_f$ at each critical point.
(c) Classify each point (local min, max, or saddle) using eigenvalues.
(d) For the saddle point, plot the contour and the two eigenvector directions showing curvature up and curvature down.
Code cell 11
# Your Solution
# Exercise 3 - learner workspace
# Write your solution here, then run the reference solution below to compare.
# The shared helpers (hessian_fd, check_true, header, ...) from the setup cell are available.
print("Learner workspace ready for Exercise 3.")
Code cell 12
# Solution
# Exercise 3 - reference solution
# Fix: the original visualised (1,-1) as "the saddle", but H(1,-1) = [[6,3],[3,6]]
# is positive definite, so (1,-1) is the LOCAL MINIMUM. The saddle is (0,0),
# where H(0,0) = [[0,3],[3,0]] has eigenvalues -3 and +3. The classification
# loop below was already correct; the plot now agrees with it.
def f_ex3(x):
    """f(x, y) = x^3 + 3xy - y^3."""
    return x[0]**3 + 3*x[0]*x[1] - x[1]**3

# Critical points:
#   df/dx = 3x^2 + 3y = 0  =>  y = -x^2
#   df/dy = 3x - 3y^2 = 0  =>  x = y^2 = x^4  =>  x(x^3 - 1) = 0
#   =>  (0, 0)  or  (1, -1)
critical_points = [np.array([0., 0.]), np.array([1., -1.])]
header('Exercise 3: Critical Point Classification')
for cp in critical_points:
    H = hessian_fd(f_ex3, cp)
    eigvals = np.sort(la.eigvalsh(H))
    grad_at_cp = np.array([3*cp[0]**2 + 3*cp[1], 3*cp[0] - 3*cp[1]**2])
    # Classify by the signs of the Hessian eigenvalues.
    if np.all(eigvals > 0):
        ctype = 'LOCAL MINIMUM'
    elif np.all(eigvals < 0):
        ctype = 'LOCAL MAXIMUM'
    else:
        ctype = 'SADDLE POINT'
    print(f'\nPoint {cp}:')
    print(f' grad = {grad_at_cp.round(8)}')
    print(f' eigenvalues = {eigvals.round(6)}')
    print(f' Type: {ctype}')
    check_true(f'gradient ~ 0 at {cp}', la.norm(grad_at_cp) < 1e-8)
# (d) Visualize the saddle at (0,0): H(0,0) has eigenvalues -3 and +3,
# with eigenvectors along (1,1)/sqrt(2) (curvature up) and (1,-1)/sqrt(2) (down).
if HAS_MPL:
    cp_sad = np.array([0., 0.])
    H_sad = hessian_fd(f_ex3, cp_sad)
    evals, evecs = la.eigh(H_sad)
    xx, yy = np.meshgrid(np.linspace(-0.8, 0.8, 60), np.linspace(-0.8, 0.8, 60))
    zz = xx**3 + 3*xx*yy - yy**3
    fig, ax = plt.subplots(figsize=(8, 7))
    cf = ax.contourf(xx, yy, zz, levels=25, cmap='RdBu_r', alpha=0.8)
    plt.colorbar(cf, ax=ax)
    ax.contour(xx, yy, zz, levels=25, colors='white', linewidths=0.4, alpha=0.4)
    ax.plot(*cp_sad, 'ko', ms=12, label='Saddle (0,0)')
    scale = 0.35
    for ev, vec in zip(evals, evecs.T):
        c = 'lime' if ev > 0 else 'red'
        ax.annotate('', xy=cp_sad+scale*vec, xytext=cp_sad,
                    arrowprops=dict(arrowstyle='->', color=c, lw=2.5))
    ax.set_xlabel('x'); ax.set_ylabel('y')
    ax.set_title('f(x,y) = x³+3xy-y³: Saddle at (0,0)')
    plt.tight_layout(); plt.show()
    print('Figure: green=curvature up (min direction), red=curvature down (saddle direction)')
print('\nTakeaway: Hessian eigenvalues classify critical points geometrically.')
print("Exercise 3 solution complete.")
Exercise 4 (★★): LayerNorm Jacobian
LayerNorm (without affine parameters): $\mathrm{LN}(x) = (x - \mu)/\sigma$ with $\mu = \tfrac{1}{d}\sum_i x_i$ and $\sigma = \sqrt{\tfrac{1}{d}\sum_i (x_i - \mu)^2}$.
(a) Derive and implement the Jacobian $J_{\mathrm{LN}}(x)$.
(b) Verify against finite differences for $x = (1, 2, 3, 4)$.
(c) Show $\mathbf{1}$ and $\hat{x} = (x - \mu)/\sigma$ are in the null space.
(d) Compute the rank of $J_{\mathrm{LN}}$ and explain what it means geometrically.
Code cell 14
# Your Solution
# Exercise 4 - learner workspace
# Write your solution here, then run the reference solution below to compare.
# The shared helpers (jacobian_fd, check_close, header, ...) from the setup cell are available.
print("Learner workspace ready for Exercise 4.")
Code cell 15
# Solution
# Exercise 4 - reference solution
def layernorm(x):
    """LayerNorm without affine parameters: (x - mean) / std."""
    mu = x.mean()
    sigma = np.sqrt(((x - mu) ** 2).mean())
    return (x - mu) / sigma

def layernorm_jacobian(x):
    """Analytical Jacobian: (1/sigma) * (I - 11^T/d - x_hat x_hat^T/d)."""
    d = len(x)
    mu = x.mean()
    sigma = np.sqrt(((x - mu) ** 2).mean())
    x_hat = (x - mu) / sigma
    identity = np.eye(d)
    centering = np.ones((d, d)) / d
    scaling = np.outer(x_hat, x_hat) / d
    return (1 / sigma) * (identity - centering - scaling)
x = np.array([1.0, 2.0, 3.0, 4.0])
d = len(x)
J_ln = layernorm_jacobian(x)
J_ln_fd = jacobian_fd(layernorm, x)
header('Exercise 4: LayerNorm Jacobian')
print('J_LN (analytical):')
print(J_ln.round(6))
check_close('J_LN matches FD', J_ln, J_ln_fd, tol=1e-6)
# (c) Null space: constant shifts and the normalised direction are annihilated.
ones_vec = np.ones(d)
mu = x.mean()
sigma = np.sqrt(((x - mu) ** 2).mean())
x_hat = (x - mu) / sigma
check_close('J_LN @ 1 = 0', J_ln @ ones_vec, np.zeros(d), tol=1e-10)
check_close('J_LN @ x_hat = 0', J_ln @ x_hat, np.zeros(d), tol=1e-10)
# (d) Two null directions => rank deficit of exactly two.
rank_ln = la.matrix_rank(J_ln)
check_true(f'rank = d-2 = {d-2}', rank_ln == d-2)
print(f'\nRank = {rank_ln}: LayerNorm cannot distinguish inputs that differ')
print('by adding a constant (direction 1) or scaling (direction x_hat).')
print('\nTakeaway: LayerNorm removes mean and variance information — its Jacobian has rank d-2.')
print("Exercise 4 solution complete.")
Exercise 5 (★★): HVPs and Edge of Stability
MSE loss: $L(w) = \tfrac{1}{2m}\lVert Xw - y\rVert^2$, Hessian $H = X^\top X / m$.
(a) Implement the HVP $v \mapsto Hv$ (matrix-free).
(b) Use power iteration (20 steps) to estimate $\lambda_{\max}(H)$.
(c) Compute the max stable learning rate $\eta_{\max} = 2/\lambda_{\max}$.
(d) Run GD with $\eta = 0.9\,\eta_{\max}$ (converges) and $\eta = 1.1\,\eta_{\max}$ (diverges).
(e) Plot both loss curves and explain the edge of stability.
Code cell 17
# Your Solution
# Exercise 5 - learner workspace
# Write your solution here, then run the reference solution below to compare.
# The shared helpers (check_true, check_close, header, ...) from the setup cell are available.
print("Learner workspace ready for Exercise 5.")
Code cell 18
# Solution
# Exercise 5 - reference solution
# Synthetic least-squares problem. The seeding and draw order are kept
# exactly so the data is reproducible.
np.random.seed(42)
m_data, n_feat = 40, 8
X = np.random.randn(m_data, n_feat)
true_w = np.random.randn(n_feat)
y = X @ true_w + 0.05*np.random.randn(m_data)

def mse_loss(w):
    """Half mean squared residual."""
    return 0.5 * np.mean((X @ w - y)**2)

def mse_grad(w):
    """Gradient of mse_loss: X^T (Xw - y) / m."""
    return X.T @ (X @ w - y) / m_data

def hvp_mse(v):
    """HVP for MSE: H@v = X^T(Xv)/m."""
    return X.T @ (X @ v) / m_data

def power_iter(hvp_fn, n, n_iters=20, seed=1):
    """Estimate the top Hessian eigenvalue using only matrix-free HVPs."""
    np.random.seed(seed)
    v = np.random.randn(n)
    v = v / la.norm(v)
    lam = 0.0
    for _ in range(n_iters):
        hv = hvp_fn(v)
        lam = v @ hv  # Rayleigh quotient (v has unit norm)
        v = hv / la.norm(hv)
    return lam

lam_max = power_iter(hvp_mse, n_feat)
lam_max_true = la.eigvalsh(X.T @ X / m_data)[-1]
eta_max = 2.0 / lam_max
header('Exercise 5: HVPs and Edge of Stability')
print(f'lambda_max (power iter): {lam_max:.6f}')
print(f'lambda_max (exact): {lam_max_true:.6f}')
check_close('Power iter matches exact', lam_max, lam_max_true, tol=0.01)
print(f'Max stable lr: eta_max = {eta_max:.6f}')
# (d) GD just below vs just above the stability threshold
n_steps = 200
losses_stable = []
losses_unstable = []
w_stable = np.zeros(n_feat)
w_unstable = np.zeros(n_feat)
for _ in range(n_steps):
    w_stable = w_stable - 0.9*eta_max * mse_grad(w_stable)
    w_unstable = w_unstable - 1.1*eta_max * mse_grad(w_unstable)
    losses_stable.append(mse_loss(w_stable))
    losses_unstable.append(min(mse_loss(w_unstable), 1e8))  # cap for plotting
check_true('Stable GD converges', losses_stable[-1] < 0.01)
check_true('Unstable GD diverges', losses_unstable[-1] > 1e4)
if HAS_MPL:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.semilogy(losses_stable, 'g-', lw=2,
                label=f'η=0.9×η_max (converges)')
    ax.semilogy(losses_unstable, 'r-', lw=2,
                label=f'η=1.1×η_max (diverges)')
    ax.set_xlabel('Steps'); ax.set_ylabel('Loss (log)')
    ax.set_title(f'Edge of Stability: η_max = {eta_max:.4f}')
    ax.legend(); plt.tight_layout(); plt.show()
print('\nTakeaway: lambda_max determines the maximum stable learning rate. '
      'Power iteration (20 HVPs) estimates it without forming H.')
print("Exercise 5 solution complete.")
Exercise 6 (★★): Newton's Method Convergence
Minimise the strictly convex function $g(x) = e^x - 3x$ (minimum at $x^\star = \ln 3$, where $g'$ vanishes).
(a) Compute $g'(x)$ and $g''(x)$.
(b) Run 10 steps of Newton's method from $x_0 = 5$.
(c) Run 100 steps of gradient descent with $\eta = 0.5$ from $x_0 = 5$.
(d) Plot both error sequences on a log scale and verify Newton's quadratic convergence.
(e) Compute the Newton decrement $\lambda(x) = \sqrt{g'(x)^2 / g''(x)}$ at each step.
Code cell 20
# Your Solution
# Exercise 6 - learner workspace
# Write your solution here, then run the reference solution below to compare.
# The shared helpers (check_true, header, ...) from the setup cell are available.
print("Learner workspace ready for Exercise 6.")
Code cell 21
# Solution
# Exercise 6 - reference solution
# Fix: the original used g'(x) = sigmoid(x) - 2 together with x_star = log 3.
# But sigmoid(x) < 1 everywhere, so that g' has no root: the stated minimiser
# does not exist and both iterations blow up. We use the strictly convex
# g(x) = exp(x) - 3x, whose unique minimiser is exactly x* = ln 3 (g'(x*) = 0),
# and for which Newton from x0 = 5 converges (quadratically near the optimum).
# The cell-local, numerically unstable sigmoid redefinition is also dropped;
# the stable helper from the setup cell remains in scope.
def g(x):
    """Objective: exp(x) - 3x, strictly convex, minimised at x* = ln 3."""
    return np.exp(x) - 3*x

def g_prime(x):
    """First derivative: exp(x) - 3."""
    return np.exp(x) - 3

def g_double_prime(x):
    """Second derivative: exp(x) > 0 everywhere (strict convexity)."""
    return np.exp(x)

x_star = np.log(3.0)
x0 = 5.0
# Newton's method
x = x0
errors_newton = [abs(x - x_star)]
decrements_newton = []
for _ in range(10):
    # Newton decrement lambda(x) = sqrt(g'(x)^2 / g''(x))
    decrement = np.sqrt(g_prime(x)**2 / g_double_prime(x))
    decrements_newton.append(decrement)
    x = x - g_prime(x) / g_double_prime(x)  # Newton step
    errors_newton.append(abs(x - x_star))
# Gradient descent
x_gd = x0
errors_gd = [abs(x_gd - x_star)]
eta_gd = 0.5
for _ in range(100):
    x_gd = x_gd - eta_gd * g_prime(x_gd)
    errors_gd.append(abs(x_gd - x_star))
header('Exercise 6: Newton vs GD Convergence')
print('Newton errors:')
for i, e in enumerate(errors_newton[:8]):
    print(f' step {i}: {e:.3e}')
# Quadratic convergence check: log|e_{t+1}| ≈ 2 * log|e_t| near optimum
valid = [(errors_newton[i], errors_newton[i+1])
         for i in range(len(errors_newton)-1)
         if errors_newton[i] > 1e-12 and errors_newton[i+1] > 1e-12]
if len(valid) >= 2:
    e_t, e_t1 = valid[-2]
    ratio = np.log(e_t1+1e-20) / (np.log(e_t+1e-20) + 1e-20)
    print(f'\nLog-error ratio (should be ~2 for quadratic convergence): {ratio:.2f}')
    check_true('Quadratic convergence (ratio > 1.5)', ratio > 1.5)
if HAS_MPL:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.semilogy(errors_newton, 'b-o', ms=6, lw=2, label='Newton (10 steps)')
    ax.semilogy(errors_gd[:20], 'r-', lw=2, label='GD η=0.5 (first 20 steps)')
    ax.set_xlabel('Step'); ax.set_ylabel('|x_t - x*|')
    ax.set_title('Newton (quadratic) vs GD (linear) Convergence')
    ax.legend(); plt.tight_layout(); plt.show()
print('\nTakeaway: Newton converges in a handful of steps (quadratic). GD is far slower (linear).')
print("Exercise 6 solution complete.")
Exercise 7 (★★★): K-FAC Kronecker Factorisation
Linear layer $h = Wx$, with backpropagated gradient $g = \partial L / \partial h$, so $\nabla_W L = g x^\top$.
(a) Show that $\operatorname{vec}(\nabla_W L) = x \otimes g$, where $\operatorname{vec}$ stacks columns.
(b) Compute the exact FIM block $F = \mathbb{E}[\operatorname{vec}(\nabla_W L)\operatorname{vec}(\nabla_W L)^\top]$ from sampled activations and gradients, and the K-FAC approximation $F \approx A \otimes G$ with $A = \mathbb{E}[xx^\top]$, $G = \mathbb{E}[gg^\top]$.
(c) Verify $(A \otimes G)^{-1} = A^{-1} \otimes G^{-1}$.
(d) Compute the K-FAC natural gradient step $\Delta W = G^{-1} (\nabla_W L)\, A^{-1}$ and verify it matches $(A^{-1} \otimes G^{-1}) \operatorname{vec}(\nabla_W L)$.
(e) Compare the computational cost: direct FIM inversion vs K-FAC inversion.
Code cell 23
# Your Solution
# Exercise 7 - learner workspace
# Write your solution here, then run the reference solution below to compare.
# The shared helpers (check_close, header, ...) from the setup cell are available.
print("Learner workspace ready for Exercise 7.")
Code cell 24
# Solution
# Exercise 7 - reference solution
n_in_k, n_out_k = 5, 4
n_samp_k = 150
np.random.seed(42)
activations_k = np.random.randn(n_samp_k, n_in_k)
back_grads_k = np.random.randn(n_samp_k, n_out_k) * 0.3
# (a) Verify vec(∇_W L) = x ⊗ g for one sample
a_i = activations_k[0]
g_i = back_grads_k[0]
grad_W_i = np.outer(g_i, a_i) # (n_out, n_in)
kron_ag = np.kron(a_i, g_i) # (n_in * n_out,)
vec_grad_W_i = grad_W_i.T.flatten() # column-major vec
header('Exercise 7: K-FAC')
print('(a) Verify vec(∇_W) = x ⊗ g:')
check_close('kron(a,g) = vec(∇_W)', kron_ag, vec_grad_W_i)
# (b) Exact FIM block
dim_fim = n_in_k * n_out_k
fim_exact_k = np.zeros((dim_fim, dim_fim))
for i in range(n_samp_k):
a = activations_k[i]; g = back_grads_k[i]
v = np.kron(a, g)
fim_exact_k += np.outer(v, v)
fim_exact_k /= n_samp_k
# K-FAC approximation
A_k = activations_k.T @ activations_k / n_samp_k
G_k = back_grads_k.T @ back_grads_k / n_samp_k
fim_kfac_k = np.kron(A_k, G_k)
rel_err_k = la.norm(fim_exact_k - fim_kfac_k) / la.norm(fim_exact_k)
print(f'\n(b) FIM shape: {fim_exact_k.shape}')
print(f'K-FAC relative error: {rel_err_k:.4f}')
# (c) (A⊗G)^{-1} = A^{-1} ⊗ G^{-1}
dam = 1e-3
A_inv_k = la.inv(A_k + dam*np.eye(n_in_k))
G_inv_k = la.inv(G_k + dam*np.eye(n_out_k))
kfac_inv_k = np.kron(A_inv_k, G_inv_k)
fim_kfac_damped = fim_kfac_k + dam*np.eye(dim_fim)
direct_inv_k = la.inv(fim_kfac_damped)
check_close('(c) (A⊗G)^-1 = A^-1⊗G^-1', kfac_inv_k, direct_inv_k, tol=1e-5)
# (d) K-FAC natural gradient step
np.random.seed(7)
grad_W_test = np.random.randn(n_out_k, n_in_k)
# Matrix form
delta_W_matrix = G_inv_k @ grad_W_test @ A_inv_k
# Vec form: (A^-1 ⊗ G^-1) vec(∇W)
vec_grad = grad_W_test.T.flatten()
delta_W_vec = (kfac_inv_k @ vec_grad).reshape(n_in_k, n_out_k).T
check_close('(d) matrix form = vec form', delta_W_matrix, delta_W_vec, tol=1e-8)
# (e) Cost comparison
print(f'\n(e) Computational cost comparison:')
print(f' Direct FIM inversion: O((n_in*n_out)^3) = O({(n_in_k*n_out_k)**3})')
print(f' K-FAC inversion: O(n_in^3 + n_out^3) = O({n_in_k**3 + n_out_k**3})')
speedup = (n_in_k*n_out_k)**3 / (n_in_k**3 + n_out_k**3)
print(f' Speedup factor: {speedup:.1f}x (grows as min(m,n)^2)')
print('\nTakeaway: K-FAC uses Kronecker structure to reduce inversion from O(n^6) to O(n^3).')
print("Exercise 7 solution complete.")
Exercise 8 (★★★): Jacobian Determinant in Normalising Flows
RealNVP coupling layer: $y_1 = x_1$, $y_2 = x_2 \odot e^{s(x_1)} + t(x_1)$.
(a) Show the Jacobian is block lower-triangular.
(b) Show $\log\lvert\det J\rvert = \sum_i s_i(x_1)$.
(c) Verify (b) numerically for a specific $x$.
(d) Implement the inverse and verify $f^{-1}(f(x)) = x$.
(e) Explain why triangular Jacobians are essential for scalable generative modelling.
Code cell 26
# Your Solution
# Exercise 8 - learner workspace
# Write your solution here, then run the reference solution below to compare.
# The shared helpers (jacobian_fd, check_close, header, ...) from the setup cell are available.
print("Learner workspace ready for Exercise 8.")
Code cell 27
# Solution
# Exercise 8 - reference solution
# Small linear "networks" parameterising the coupling layer.
W_s = np.array([[0.5, -0.2], [0.3, 0.1]])
W_t = np.array([[0.1, 0.0], [0.0, -0.2]])

def s_net(x1):
    """Scale network s(x1) (here just a linear map)."""
    return W_s @ x1

def t_net(x1):
    """Shift network t(x1) (here just a linear map)."""
    return W_t @ x1

def coupling_fwd(x):
    """RealNVP coupling: y1 = x1, y2 = x2 * exp(s(x1)) + t(x1)."""
    x1 = x[:2]
    x2 = x[2:]
    scale = np.exp(s_net(x1))
    shift = t_net(x1)
    return np.concatenate([x1, x2 * scale + shift])

def coupling_inv(y):
    """Exact inverse of coupling_fwd: x2 = (y2 - t(y1)) * exp(-s(y1))."""
    y1 = y[:2]
    y2 = y[2:]
    x1 = y1.copy()
    recovered_x2 = (y2 - t_net(x1)) * np.exp(-s_net(x1))
    return np.concatenate([x1, recovered_x2])
x_test = np.array([1.0, 0.5, -0.3, 2.0])
header('Exercise 8: Normalising Flow Jacobian')
# (a)+(b)+(c): triangular structure and the log-det shortcut
y_test = coupling_fwd(x_test)
J_coupling = jacobian_fd(coupling_fwd, x_test)
print('(a)+(b) Jacobian (should be lower-triangular):')
print(J_coupling.round(4))
upper_right = J_coupling[:2, 2:]
check_close('Upper-right block = 0 (triangular)', upper_right, np.zeros((2,2)), tol=1e-8)
log_det_fd = np.log(np.abs(la.det(J_coupling)))
s_at_x1 = s_net(x_test[:2])
log_det_formula = np.sum(s_at_x1)
print(f'\n(c) log|det J| (direct from J): {log_det_fd:.6f}')
print(f' log|det J| (formula sum(s)): {log_det_formula:.6f}')
check_close('log-det formula correct', log_det_fd, log_det_formula, tol=1e-6)
# (d) Round trip through the inverse
x_rec = coupling_inv(y_test)
check_close('(d) f^-1(f(x)) = x', x_rec, x_test, tol=1e-10)
# (e) Scalability argument
print('\n(e) Why triangular Jacobian matters:')
print(' For d=1000: direct det(J) = O(d^3) = O(10^9) operations')
print(' Triangular: det = product of diagonals = O(d) operations')
print(' Training NF via MLE: needs log|det J| at every step => must be O(d)')
print('\nTakeaway: Coupling layers make log|det J| = sum(s(x1)) tractable — '
      'from O(d^3) to O(d). This is the key design choice in RealNVP and Glow.')
print("Exercise 8 solution complete.")
Exercise 9 (★★★): Hessian-Vector Products Without Forming the Hessian
For $f(x) = \sum_i \log\!\big(1 + e^{(Ax)_i}\big)$ (softplus applied to each row of $Ax$),
compute $Hv$ by finite differences of gradients and compare with the exact Hessian-vector product $Hv = A^\top \operatorname{diag}\!\big(s \odot (1 - s)\big)\, A v$, where $s = \sigma(Ax)$.
Code cell 29
# Your Solution
# Exercise 9 - learner workspace
# Compare finite-difference HVP with an analytic HVP.
# The shared helpers (sigmoid, check_close, header, ...) from the setup cell are available.
print("Learner workspace ready for Exercise 9.")
Code cell 30
# Solution
# Exercise 9 - Hessian-vector products
header("Exercise 9: Hessian-vector product")
A = np.array([[1.0, -0.5], [0.3, 0.8], [-1.2, 0.4]])
x = np.array([0.2, -0.7])
v = np.array([0.6, -0.1])

def grad_f(x):
    """Gradient of f(x) = sum(softplus(A @ x)): A^T sigmoid(Ax)."""
    return A.T @ sigmoid(A @ x)

def hvp_exact(x, v):
    """Exact HVP: A^T diag(s*(1-s)) A v, with s = sigmoid(Ax)."""
    s = sigmoid(A @ x)
    weights = s * (1 - s)
    return A.T @ (weights * (A @ v))

def hvp_fd(x, v, h=1e-5):
    """Directional central difference of the gradient along v."""
    return (grad_f(x + h*v) - grad_f(x - h*v)) / (2*h)

exact = hvp_exact(x, v)
fd = hvp_fd(x, v)
print("exact HVP:", exact)
print("finite-difference HVP:", fd)
check_close("HVP match", exact, fd, tol=1e-6)
print("Takeaway: second-order optimizers often need HVPs, not explicit Hessian matrices.")
Exercise 10 (★★★): Gauss-Newton Approximation for Least Squares
For residuals $r(\theta)$ and loss $L(\theta) = \tfrac{1}{2}\lVert r(\theta)\rVert^2$, the Gauss-Newton matrix is $G = J_r^\top J_r$.
Compute $r(\theta)$, $J_r$, the gradient $\nabla L = J_r^\top r$, and compare a Gauss-Newton step $-G^{-1}\nabla L$ with a gradient descent step.
Code cell 32
# Your Solution
# Exercise 10 - learner workspace
# Build residuals, Jacobian, and compare update directions.
# The shared helpers (check_true, header, la, ...) from the setup cell are available.
print("Learner workspace ready for Exercise 10.")
Code cell 33
# Solution
# Exercise 10 - Gauss-Newton step
header("Exercise 10: Gauss-Newton least squares")
xs = np.array([-1.0, 0.0, 1.0, 2.0])
y_obs = np.array([0.2, 1.0, 2.7, 5.1])
theta = np.array([1.0, 0.5])  # model theta0 + theta1*x^2

def residual(theta):
    """Residual vector r(theta) = model(theta) - y_obs."""
    return theta[0] + theta[1] * xs**2 - y_obs

def jac(theta):
    """Jacobian of the residuals (constant: the model is linear in theta)."""
    return np.column_stack([np.ones_like(xs), xs**2])

r = residual(theta)
J = jac(theta)
grad = J.T @ r       # gradient of the half-squared-residual loss
G = J.T @ J          # Gauss-Newton matrix
step_gn = -la.solve(G + 1e-8*np.eye(2), grad)  # tiny damping for safety
step_gd = -0.1 * grad
print("gradient:", grad)
print("Gauss-Newton step:", step_gn)
print("gradient descent step:", step_gd)
loss_before = 0.5*la.norm(r)**2
loss_after = 0.5*la.norm(residual(theta+step_gn))**2
check_true("Gauss-Newton decreases loss", loss_after < loss_before)
print("Takeaway: Gauss-Newton uses Jacobian geometry to scale residual-fitting updates.")
What to Review After Finishing
- Exercise 1 — Jacobian rows = output gradients, columns = input sensitivities
- Exercise 2 — Softmax Jacobian is $\operatorname{diag}(p) - p p^\top$, rank $K - 1$
- Exercise 3 — Hessian eigenvalues classify critical points geometrically
- Exercise 4 — LayerNorm removes 2 degrees of freedom, rank $d - 2$
- Exercise 5 — HVPs enable $\lambda_{\max}$ estimation without forming $H$
- Exercise 6 — Newton converges quadratically vs linearly for GD
- Exercise 7 — K-FAC: Kronecker structure reduces inversion from $O((mn)^3)$ to $O(m^3 + n^3)$
- Exercise 8 — Triangular Jacobians make $\log\lvert\det J\rvert$ tractable in $O(d)$
References
- Pearlmutter (1994) — Fast Hessian-vector products
- Martens & Grosse (2015) — K-FAC
- Foret et al. (2021) — SAM
- Dinh et al. (2017) — RealNVP (coupling layers)
- Cohen et al. (2022) — Edge of stability