Exercises Notebook
Converted from `exercises.ipynb` for web reading.
Automatic Differentiation - Exercises
10 graded exercises covering the full section arc, from multivariate calculus mechanics to ML-facing gradient systems.
| Format | Description |
|---|---|
| Problem | Markdown cell with task description |
| Your Solution | Code cell for learner work |
| Solution | Reference solution with checks |
Difficulty: straightforward -> moderate -> challenging.
Code cell 2
# Notebook-wide plotting setup: prefer seaborn theming when available,
# otherwise fall back to matplotlib's bundled seaborn-like style sheet.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False
# Shared figure defaults applied to every plot in the notebook.
mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})
np.random.seed(42)  # reproducible randomness across notebook runs
print("Plot setup complete.")
Code cell 3
import numpy as np
import numpy.linalg as la
from scipy import optimize, special, stats
from scipy.optimize import minimize, fsolve, linprog
from math import factorial
import math
import matplotlib.patches as patches
# Colour-blind-safe palette shared by all exercise plots.
COLORS = {
    "primary": "#0077BB",
    "secondary": "#EE7733",
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",
    "highlight": "#EE3377",
}
HAS_MPL = True  # matplotlib was imported without error in the setup cell
np.set_printoptions(precision=8, suppress=True)
np.random.seed(42)  # deterministic exercise data
spmin = minimize  # short alias for scipy.optimize.minimize
# torch is optional: exercises fall back to numpy-only checks without it.
try:
    import torch
    HAS_TORCH = True
except ImportError:
    torch = None
    HAS_TORCH = False
def header(title):
    """Print *title* framed by '=' rules sized to the title length."""
    rule = "=" * len(title)
    print("\n" + rule)
    print(title)
    print(rule)
def check_true(name, cond):
    """Print a PASS/FAIL line for a boolean condition and return it as bool."""
    result = bool(cond)
    status = "PASS" if result else "FAIL"
    print(f"{status} - {name}")
    return result
def check_close(name, got, expected, tol=1e-8):
    """Print PASS/FAIL comparing `got` to `expected` within tol; return the bool."""
    passed = np.allclose(got, expected, atol=tol, rtol=tol)
    status = "PASS" if passed else "FAIL"
    print(f"{status} - {name}: got {got}, expected {expected}")
    return passed
def check(name, got, expected, tol=1e-8):
    # Backwards-compatible alias: older exercise cells call `check` instead
    # of `check_close`; both share the same semantics.
    return check_close(name, got, expected, tol=tol)
def sigmoid(x):
    """Logistic function, elementwise, with the standard two-branch stable form."""
    arr = np.asarray(x, dtype=float)
    positive_branch = 1.0 / (1.0 + np.exp(-arr))
    negative_branch = np.exp(arr) / (1.0 + np.exp(arr))
    return np.where(arr >= 0, positive_branch, negative_branch)
def softmax(z, axis=-1):
    """Softmax along `axis`, max-shifted for numerical stability."""
    logits = np.asarray(z, dtype=float)
    shifted = logits - logits.max(axis=axis, keepdims=True)
    weights = np.exp(shifted)
    return weights / weights.sum(axis=axis, keepdims=True)
def relu(x):
    """Rectified linear unit: elementwise maximum of x and zero."""
    return np.maximum(x, 0)
def relu_prime(x):
    """Derivative of relu (convention: slope 0 at x == 0)."""
    positive = np.asarray(x) > 0
    return positive.astype(float)
def centered_diff(f, x, h=1e-6):
    """Central finite-difference estimate of f'(x) with step h (O(h^2) error)."""
    forward = f(x + h)
    backward = f(x - h)
    return (forward - backward) / (2.0 * h)
def numerical_gradient(f, x, h=1e-6):
    """Central-difference gradient of scalar f at array x, one entry at a time."""
    point = np.asarray(x, dtype=float)
    out = np.zeros_like(point, dtype=float)
    for idx in np.ndindex(point.shape):
        plus = point.copy()
        minus = point.copy()
        plus[idx] += h
        minus[idx] -= h
        out[idx] = (f(plus) - f(minus)) / (2 * h)
    return out
def numerical_jacobian(f, x, h=1e-6):
    """Central-difference Jacobian of vector-valued f; shape f(x).shape + x.shape."""
    point = np.asarray(x, dtype=float)
    base = np.asarray(f(point), dtype=float)
    jac = np.zeros((base.size, point.size), dtype=float)
    flat = point.reshape(-1)
    for col in range(point.size):
        plus = flat.copy()
        minus = flat.copy()
        plus[col] += h
        minus[col] -= h
        f_plus = np.asarray(f(plus.reshape(point.shape)))
        f_minus = np.asarray(f(minus.reshape(point.shape)))
        jac[:, col] = ((f_plus - f_minus) / (2 * h)).reshape(-1)
    return jac.reshape(base.shape + point.shape)
def grad_check(f, x, analytic_grad, h=1e-6):
    """Symmetric relative error between analytic_grad and an FD gradient of f at x."""
    fd_estimate = numerical_gradient(f, x, h=h)
    scale = la.norm(analytic_grad) + la.norm(fd_estimate) + 1e-12
    return la.norm(analytic_grad - fd_estimate) / scale
def jacobian_fd(f, x, h=1e-6):
    """Finite-difference Jacobian of f at x, returned with shape f(x).shape + x.shape."""
    point = np.asarray(x, dtype=float)
    base = np.asarray(f(point), dtype=float)
    flat = point.reshape(-1)
    columns = []
    for j in range(flat.size):
        plus = flat.copy()
        minus = flat.copy()
        plus[j] += h
        minus[j] -= h
        y_plus = np.asarray(f(plus.reshape(point.shape)), dtype=float).reshape(-1)
        y_minus = np.asarray(f(minus.reshape(point.shape)), dtype=float).reshape(-1)
        columns.append((y_plus - y_minus) / (2 * h))
    if columns:
        jac = np.stack(columns, axis=1)
    else:
        jac = np.zeros((base.size, 0), dtype=float)
    return jac.reshape(base.shape + point.shape)
def hessian_fd(f, x, h=1e-5):
    """Finite-difference Hessian of scalar f at x, shape x.shape + x.shape.

    Differentiates the FD gradient once more (nested central differences),
    so the result is noisier than a single-level FD — hence the larger
    default step h=1e-5.
    """
    x = np.asarray(x, dtype=float)
    H = np.zeros((x.size, x.size), dtype=float)
    flat = x.reshape(-1)
    for j in range(x.size):
        xp = flat.copy(); xm = flat.copy()
        xp[j] += h; xm[j] -= h
        # Column j = central difference of the FD gradient along coordinate j.
        gp = numerical_gradient(lambda z: f(z.reshape(x.shape)), xp.reshape(x.shape), h=h).reshape(-1)
        gm = numerical_gradient(lambda z: f(z.reshape(x.shape)), xm.reshape(x.shape), h=h).reshape(-1)
        H[:, j] = (gp - gm) / (2*h)
    return H.reshape(x.shape + x.shape)
def grad_fd(f, x, h=1e-6):
    # Alias kept because some exercise cells use the `grad_fd` name.
    return numerical_gradient(f, x, h=h)
def fd_grad(f, x, h=1e-6):
    # Same as grad_fd but coerces x (e.g. a plain list) to a float array first.
    return numerical_gradient(f, np.asarray(x, dtype=float), h=h)
print("Chapter helper setup complete.")
Exercise 1 (★): Dual Number Implementation
Implement a Dual class supporting +, -, *, /, ** (integer powers), and the elementary functions exp, log, sin, cos.
Use your implementation to compute the derivative of
f(x) = log(sin(x^2) + e^x)
at x = 1.5, and verify against central finite differences.
(a) Implement the Dual class with __add__, __mul__, __truediv__, __pow__.
(b) Implement d_exp, d_log, d_sin, d_cos as functions on Dual objects.
(c) Compute f'(1.5) using dual numbers.
(d) Compare to the central FD approximation with h = 1e-5.
(e) Compute the relative error — what order of magnitude is it?
Code cell 5
# Your Solution
# Exercise 1 - learner workspace: implement the Dual class, the d_exp/d_log/
# d_sin/d_cos helpers, and the dual-number derivative here.
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 1.")
Code cell 6
# Solution
# Exercise 1 - reference solution
class Dual:
    """Dual number a + b*eps with eps^2 = 0 — the carrier of forward-mode AD.

    `real` holds the value, `dual` the tangent (derivative). Seeding an input
    with Dual(x0, 1.0) makes every arithmetic rule below propagate f'(x0).
    Added over the original: __neg__, __rtruediv__, and __repr__, so that
    expressions like -x and 2/x work and instances print readably.
    """
    def __init__(self, real, dual=0.0):
        self.real = float(real)
        self.dual = float(dual)
    def __repr__(self):
        return f"Dual({self.real}, {self.dual})"
    def __add__(self, other):
        if isinstance(other, (int, float)):
            return Dual(self.real + other, self.dual)
        return Dual(self.real + other.real, self.dual + other.dual)
    __radd__ = __add__
    def __sub__(self, other):
        if isinstance(other, (int, float)):
            return Dual(self.real - other, self.dual)
        return Dual(self.real - other.real, self.dual - other.dual)
    def __rsub__(self, other):
        # number - Dual
        return Dual(other - self.real, -self.dual)
    def __neg__(self):
        # Unary minus (missing in the original: -Dual(...) raised TypeError).
        return Dual(-self.real, -self.dual)
    def __mul__(self, other):
        if isinstance(other, (int, float)):
            return Dual(self.real * other, self.dual * other)
        # Product rule: (a + b eps)(c + d eps) = ac + (ad + bc) eps
        return Dual(self.real * other.real, self.real * other.dual + self.dual * other.real)
    __rmul__ = __mul__
    def __truediv__(self, other):
        if isinstance(other, (int, float)):
            return Dual(self.real / other, self.dual / other)
        # Quotient rule
        r = other.real
        return Dual(self.real / r, (self.dual * r - self.real * other.dual) / (r * r))
    def __rtruediv__(self, other):
        # number / Dual: d(c/x) = -c * x' / x^2
        return Dual(other / self.real, -other * self.dual / (self.real * self.real))
    def __pow__(self, n):
        # Power rule for a constant (integer) exponent n.
        return Dual(self.real ** n, n * self.real ** (n - 1) * self.dual)
def d_exp(x):
    """exp on duals: the tangent is scaled by exp(x.real)."""
    value = math.exp(x.real)
    return Dual(value, x.dual * value)
def d_log(x):
    """log on duals: tangent divided by the argument."""
    return Dual(math.log(x.real), x.dual / x.real)
def d_sin(x):
    """sin on duals: tangent scaled by cos."""
    return Dual(math.sin(x.real), x.dual * math.cos(x.real))
def d_cos(x):
    """cos on duals: tangent scaled by -sin."""
    return Dual(math.cos(x.real), -x.dual * math.sin(x.real))
def f_dual(x):
    """Target function log(sin(x^2) + exp(x)) evaluated on a Dual input."""
    return d_log(d_sin(x ** 2) + d_exp(x))
def f_scalar(x):
    """Plain-float version of the same function, used for FD comparison."""
    return math.log(math.sin(x ** 2) + math.exp(x))
# Evaluate f and f' at x0 by seeding the dual part with 1.0.
x0 = 1.5
result = f_dual(Dual(x0, 1.0))
# Analytic derivative: (2x cos(x^2) + exp(x)) / (sin(x^2) + exp(x))
expected = (2*x0*math.cos(x0**2) + math.exp(x0)) / (math.sin(x0**2) + math.exp(x0))
# FD comparison (central difference, O(h^2) truncation error)
h = 1e-5
fd = (f_scalar(x0+h) - f_scalar(x0-h)) / (2*h)
header('Exercise 1: Dual Number Derivative')
print(f'f(1.5) = {result.real:.8f}')
print(f"f'(1.5) dual = {result.dual:.10f}")
print(f"f'(1.5) analytic = {expected:.10f}")
print(f"f'(1.5) FD = {fd:.10f}")
print()
check_close("dual matches analytic", result.dual, expected, tol=1e-12)
print(f' Dual error: {abs(result.dual - expected):.2e} (machine epsilon)')
# Fix: the closing parenthesis of this message was missing in the original.
print(f' FD error: {abs(fd - expected):.2e} (O(h^2)=O(1e-10))')
print()
print('Takeaway: Dual numbers give exact (machine-precision) derivatives.')
print('FD is limited to ~1e-10 by cancellation; dual numbers have no such limit.')
print("Exercise 1 solution complete.")
Exercise 2 (★): Manual Reverse-Mode Trace
For f(x1, x2, x3) = (x1*x2 + x3) * exp(-x1) at (x1, x2, x3) = (1, 2, 3):
(a) Write out the Wengert list (intermediate variables and their values).
(b) Execute the backward pass by hand: initialise the output adjoint to 1 and compute all intermediate adjoints v̄_i.
(c) Report the full gradient ∇f = (∂f/∂x1, ∂f/∂x2, ∂f/∂x3).
(d) Verify using fd_grad.
(e) Count multiply-add operations in forward vs backward. What is the overhead ratio?
Code cell 8
# Your Solution
# Exercise 2 - learner workspace: write out the Wengert list and backward pass by hand.
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 2.")
Code cell 9
# Solution
# Exercise 2 - reference solution
def manual_reverse(x1, x2, x3):
    """Hand-executed reverse-mode AD for f = (x1*x2 + x3) * exp(-x1).

    Returns the gradient tuple (df/dx1, df/dx2, df/dx3).
    """
    # --- Forward sweep: the Wengert list of intermediates ---
    prod = x1 * x2            # v4 = x1*x2
    summed = prod + x3        # v5 = x1*x2 + x3
    neg = -x1                 # v6 = -x1
    expneg = math.exp(neg)    # v7 = exp(-x1)
    out = summed * expneg     # v8 = output (recorded for completeness)
    # --- Backward sweep: adjoints, seeded with dout/dout = 1 ---
    out_bar = 1.0
    summed_bar = out_bar * expneg      # dout/dv5 = v7
    expneg_bar = out_bar * summed      # dout/dv7 = v5
    neg_bar = expneg_bar * expneg      # d exp(u)/du = exp(u) = v7
    prod_bar = summed_bar              # dv5/dv4 = 1
    x3_bar = summed_bar                # dv5/dx3 = 1
    x2_bar = prod_bar * x1             # dv4/dx2 = x1
    # x1 feeds TWO nodes (prod and neg): accumulate both contributions.
    x1_bar = prod_bar * x2 + neg_bar * (-1.0)
    return x1_bar, x2_bar, x3_bar
# Evaluation point for the hand trace.
x1, x2, x3 = 1.0, 2.0, 3.0
g1, g2, g3 = manual_reverse(x1, x2, x3)
# FD verification: the same function written on a flat vector input.
def f_scalar(x):
    return (x[0]*x[1] + x[2]) * math.exp(-x[0])
g_fd = fd_grad(f_scalar, [x1, x2, x3])
# Analytic check:
# df/dx1 = x2*exp(-x1) + (x1*x2+x3)*(-exp(-x1)) = exp(-x1)*(x2 - x1*x2 - x3)
# = exp(-1)*(2 - 2 - 3) = -3*exp(-1)
# df/dx2 = x1*exp(-x1) = 1*exp(-1) = exp(-1)
# df/dx3 = exp(-x1) = exp(-1)
e1 = math.exp(-1)
expected_g = np.array([-3*e1, e1, e1])
header('Exercise 2: Manual Reverse-Mode Trace')
print(f'Manual reverse: [{g1:.6f}, {g2:.6f}, {g3:.6f}]')
print(f'FD gradient: {g_fd.round(6)}')
print(f'Analytic: {expected_g.round(6)}')
print()
check_close('df/dx1', g1, expected_g[0])
check_close('df/dx2', g2, expected_g[1])
check_close('df/dx3', g3, expected_g[2])
print()
print('Forward ops: 5 (mul, add, neg, exp, mul)')
print('Backward ops: ~10 (each op spawns ~2 adjoint updates)')
print('Overhead ratio: ~2x — the cheap gradient principle!')
print()
print('Takeaway: Reverse mode computes the full gradient in one backward pass,')
print('costing ~2-3x the forward pass regardless of the number of inputs.')
print("Exercise 2 solution complete.")
Exercise 3 (★): Topological Sort for Computational Graphs
Implement Kahn's algorithm and apply it to the computational graph of f(x, y, z) = sin(x + y) * log(y * z):
(a) Draw the computational graph (list nodes and edges).
(b) Implement topological_sort(nodes, edges) using Kahn's algorithm.
(c) Verify the forward order is valid (each node appears after all its dependencies).
(d) List the backward (adjoint accumulation) order.
(e) Which node's adjoint accumulates contributions from two different paths?
Code cell 11
# Your Solution
# Exercise 3 - learner workspace: implement topological_sort(nodes, edges) here.
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 3.")
Code cell 12
# Solution
# Exercise 3 - reference solution
from collections import defaultdict, deque
def topological_sort(nodes, edges):
    """Kahn's algorithm: return a dependency-respecting ordering of `nodes`.

    `edges` is an iterable of (src, dst) pairs meaning dst depends on src.
    Raises ValueError if the edge set contains a cycle.
    """
    children = defaultdict(list)
    pending = dict.fromkeys(nodes, 0)  # in-degree of each node
    for parent, child in edges:
        children[parent].append(child)
        pending[child] += 1
    ready = deque(node for node in nodes if pending[node] == 0)
    ordering = []
    while ready:
        node = ready.popleft()
        ordering.append(node)
        for child in children[node]:
            pending[child] -= 1
            if pending[child] == 0:
                ready.append(child)
    if len(ordering) != len(nodes):
        raise ValueError('Cycle detected')
    return ordering
# Graph: f(x,y,z) = sin(x+y) * log(y*z)
nodes_ex3 = ['x', 'y', 'z', 's1', 's2', 's3', 's4', 'out']
edges_ex3 = [
    ('x', 's1'),   # s1 = x+y
    ('y', 's1'),
    ('y', 's2'),   # s2 = y*z
    ('z', 's2'),
    ('s1', 's3'),  # s3 = sin(s1)
    ('s2', 's4'),  # s4 = log(s2)
    ('s3', 'out'), # out = s3*s4
    ('s4', 'out'),
]
fwd = topological_sort(nodes_ex3, edges_ex3)
# Adjoints are accumulated by visiting nodes in reverse topological order.
bwd = list(reversed(fwd))
# Validate: for each edge (src, dst), src must appear before dst in fwd
pos = {n: i for i, n in enumerate(fwd)}
valid = all(pos[src] < pos[dst] for src, dst in edges_ex3)
header('Exercise 3: Topological Sort')
print(f'Forward order: {fwd}')
print(f'Backward order: {bwd}')
print()
check_true('Forward order respects all dependencies', valid)
print()
print('Node y receives adjoint from TWO paths:')
print(' Path 1: out -> s3 -> s1 -> y (through sin(x+y))')
print(' Path 2: out -> s4 -> s2 -> y (through log(y*z))')
print("y's adjoint = y_bar from s1 + y_bar from s2 [accumulate!]")
print()
print('Takeaway: Multi-use variables accumulate adjoint contributions from ALL downstream paths.')
print('This is why backward pass uses += not = when updating adjoints.')
print("Exercise 3 solution complete.")
Exercise 4 (★★): JVP vs VJP Complexity Crossover
For f: R^n -> R^m, computing the full Jacobian costs:
- Forward mode: n passes (one JVP per input basis vector)
- Reverse mode: m passes (one VJP per output basis vector)
(a) Implement jacobian_forward(A, x) and jacobian_reverse(A, x) for the linear map f(x) = Ax.
(b) Verify both give the same Jacobian J = A.
(c) Time both for (n, m) pairs: (5, 100), (50, 50), (100, 5), (1, 500), (500, 1).
(d) Plot the forward/reverse time ratio as a function of n/m.
(e) Explain the implication for physics simulations with n = 3 control parameters and m = 10000 state outputs.
Code cell 14
# Your Solution
# Exercise 4 - learner workspace: build the Jacobian via JVPs and via VJPs, then time both.
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 4.")
Code cell 15
# Solution
# Exercise 4 - reference solution
import time
def jacobian_forward(A, x):
    """Assemble J = A column by column via JVPs with input basis vectors (n passes).

    `x` is unused for a linear map but kept for a uniform interface.
    """
    m, n = A.shape
    J = np.zeros((m, n))
    basis = np.eye(n)
    for j in range(n):
        J[:, j] = A @ basis[j]  # JVP with e_j extracts column j
    return J
def jacobian_reverse(A, x):
    """Assemble J = A row by row via VJPs with output basis vectors (m passes).

    `x` is unused for a linear map but kept for a uniform interface.
    """
    m, n = A.shape
    J = np.zeros((m, n))
    basis = np.eye(m)
    for i in range(m):
        J[i, :] = A.T @ basis[i]  # VJP with e_i extracts row i
    return J
# Timing sweep over Jacobian shapes: wide, square, tall, and two extreme cases.
cases = [(5, 100), (50, 50), (100, 5), (1, 500), (500, 1)]
ratios, nm_ratios = [], []
print(f'{"(n,m)":>12} {"t_fwd(s)":>10} {"t_rev(s)":>10} {"ratio F/R":>10} {"n/m":>6}')
for n, m in cases:
    A = np.random.randn(m, n)
    x = np.random.randn(n)
    t0 = time.perf_counter()
    Jf = jacobian_forward(A, x)
    tf = time.perf_counter() - t0
    t0 = time.perf_counter()
    Jr = jacobian_reverse(A, x)
    tr = time.perf_counter() - t0
    ok = np.allclose(Jf, Jr)
    ratio = tf / max(tr, 1e-9)  # guard against a zero-duration timing
    ratios.append(ratio)
    nm_ratios.append(n/m)
    print(f' ({n:>4},{m:>4}) {tf:>10.5f} {tr:>10.5f} {ratio:>10.2f} {n/m:>6.2f} match={ok}')
print()
print('Physics sim (n=3, m=10000):')
print(' Forward: 3 passes. Reverse: 10000 passes.')
print(' Use FORWARD mode! ~3333x fewer passes.')
print(' This is why physics/control simulations use forward-mode AD.')
# Plot the crossover only when matplotlib is available.
if HAS_MPL and len(ratios) >= 3:
    fig, ax = plt.subplots(figsize=(7, 4))
    ax.plot(nm_ratios, ratios, 'o-', color='tab:blue')
    ax.axhline(1.0, color='gray', linestyle='--', label='crossover')
    ax.set_xlabel('n/m (input dim / output dim)')
    ax.set_ylabel('Forward time / Reverse time')
    ax.set_title('Forward vs Reverse Mode Crossover')
    ax.set_xscale('log'); ax.legend()
    plt.tight_layout(); plt.show()
print()
print('Takeaway: Choose mode based on Jacobian shape.')
print('ML (n>>m=1): always reverse. Physics (n<<m): always forward.')
print("Exercise 4 solution complete.")
Exercise 5 (★★): Hessian-Vector Products
For f(x) = (1/2)||Ax - b||^2, the Hessian is H = A^T A. Implement three methods to compute the Hessian-vector product Hv:
(a) Direct: materialise H = A^T A and multiply by v. (b) Gradient-FD: Hv ≈ (∇f(x + εv) − ∇f(x − εv)) / (2ε). (c) Verify both methods give the same result. (d) For large n, compare memory usage: full Hessian vs HVP trick. (e) At what n does the full Hessian exceed 1 GB? At what n does the HVP vector exceed 1 GB?
Code cell 17
# Your Solution
# Exercise 5 - learner workspace: implement the direct and gradient-FD HVPs.
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 5.")
Code cell 18
# Solution
# Exercise 5 - reference solution
def hvp_direct(A, x, v):
    """HVP by materialising the full Hessian H = A^T A (O(n^2) memory).

    `x` is unused because the least-squares Hessian is constant in x.
    """
    hessian = A.T @ A
    return hessian @ v
def hvp_fd(A, b, x, v, eps=1e-5):
    """Matrix-free HVP: central difference of the least-squares gradient along v."""
    def grad_at(z):
        residual = A @ z - b
        return A.T @ residual
    return (grad_at(x + eps * v) - grad_at(x - eps * v)) / (2 * eps)
# Random least-squares instance for the HVP comparison.
np.random.seed(42)
m5, n5 = 20, 5
A5 = np.random.randn(m5, n5)
b5 = np.random.randn(m5)
x5 = np.random.randn(n5)
v5 = np.random.randn(n5)
hvp1 = hvp_direct(A5, x5, v5)
hvp2 = hvp_fd(A5, b5, x5, v5)
# (Re-define here for standalone execution)
def header(t): print('\n'+'='*len(t)+'\n'+t+'\n'+'='*len(t))
def check_close(name, got, expected, tol=1e-6):
    ok = np.allclose(got, expected, atol=tol)
    print(f' {"PASS" if ok else "FAIL"} -- {name}')
    return ok
header('Exercise 5: Hessian-Vector Products')
print(f'Direct HVP: {hvp1.round(4)}')
print(f'FD HVP: {hvp2.round(4)}')
check_close('direct == FD', hvp1, hvp2, tol=1e-4)
print()
# Memory analysis
print('Memory analysis (float64 = 8 bytes):')
print(f'{"n":>8} {"H (n^2 floats)":>18} {"HVP vector":>14}')
for n_mem in [100, 1000, 5000, 11180, 100000]:
    hess_mb = n_mem**2 * 8 / 1e6  # dense Hessian: n^2 doubles
    hvp_mb = n_mem * 3 * 8 / 1e6  # HVP work set: ~3 length-n vectors
    flag_h = ' <- 1GB!' if hess_mb >= 1000 else ''
    flag_v = ' <- 1GB!' if hvp_mb >= 1000 else ''
    print(f' n={n_mem:>6} H: {hess_mb:>10.1f} MB{flag_h} v: {hvp_mb:>6.2f} MB{flag_v}')
print()
n_1gb_H = int(1e9/8)**0.5  # n at which 8*n^2 bytes reaches 1 GB
print(f'Hessian exceeds 1 GB at n ~ {n_1gb_H:.0f}')
print(f'HVP vector exceeds 1 GB at n ~ {int(1e9/(3*8)):,}')
print()
print('Takeaway: HVP is the key primitive for second-order optimisation at scale.')
print('SAM, Newton-CG, and curvature analysis all use HVPs, never full Hessians.')
print("Exercise 5 solution complete.")
Exercise 6 (★★): Custom Backward (Stable Log-Sum-Exp)
The log-sum-exp function LSE(x) = log Σ_i e^{x_i} overflows naively for large x_i. Implement a stable version with a custom VJP.
(a) Implement naive logsumexp(x) = log(Σ_i e^{x_i}) — observe overflow at large values.
(b) Implement stable logsumexp_stable(x) = m + log(Σ_i e^{x_i − m}) where m = max_i x_i.
(c) Show the VJP of LSE is softmax(x).
(d) Verify the gradient matches central finite differences at x = [1, 2, 3, 100].
(e) Show the naive implementation fails at entries near 1000, while the stable one succeeds.
Code cell 20
# Your Solution
# Exercise 6 - learner workspace: implement naive/stable log-sum-exp and its softmax VJP.
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 6.")
Code cell 21
# Solution
# Exercise 6 - reference solution
def logsumexp_naive(x):
    """Direct log(sum(exp(x))) — overflows to inf once any entry exceeds ~709."""
    exponentials = np.exp(x)
    return np.log(exponentials.sum())
def logsumexp_stable(x):
    """Max-shifted log-sum-exp: mathematically identical, immune to overflow."""
    shift = np.max(x)
    return shift + np.log(np.exp(x - shift).sum())
def logsumexp_grad(x):
    """Custom VJP rule for LSE: the gradient is softmax(x) (output adjoint = 1)."""
    shifted = np.exp(x - np.max(x))  # max-shift keeps exp in range
    return shifted / shifted.sum()
# (Re-defined here so this cell runs standalone.)
def header(t): print('\n'+'='*len(t)+'\n'+t+'\n'+'='*len(t))
def check_close(name, got, expected, tol=1e-6):
    ok = np.allclose(got, expected, atol=tol, rtol=tol)
    print(f' {"PASS" if ok else "FAIL"} -- {name}')
    if not ok: print(f' expected: {expected}, got: {got}')
    return ok
header('Exercise 6: Stable Log-Sum-Exp')
# Part (a)-(b): overflow check
x_small = np.array([1., 2., 3., 4.])
x_large = np.array([1., 2., 3., 1000.])
print(f'Small x: naive={logsumexp_naive(x_small):.4f}, stable={logsumexp_stable(x_small):.4f}')
print(f'Large x: naive={logsumexp_naive(x_large):.4f} (inf=overflow), stable={logsumexp_stable(x_large):.4f}')
print()
# Part (d): gradient verification against per-coordinate central differences
x_test = np.array([1., 2., 3., 100.])
g_ad = logsumexp_grad(x_test)
h = 1e-5
g_fd = np.array([(logsumexp_stable(x_test + h*np.eye(4)[i]) -
                  logsumexp_stable(x_test - h*np.eye(4)[i])) / (2*h) for i in range(4)])
print(f'Gradient via softmax: {g_ad.round(6)}')
print(f'Gradient via FD: {g_fd.round(6)}')
check_close('softmax grad matches FD', g_ad, g_fd, tol=1e-5)
# Part (e): x=1000 failure check
x_extreme = np.array([1., 2., 1000., 1001.])
naive_val = logsumexp_naive(x_extreme)
stable_val = logsumexp_stable(x_extreme)
print(f'\nx=[1,2,1000,1001]: naive={naive_val} (inf), stable={stable_val:.4f}')
import math
# Exact value is dominated by the two largest entries: 1001 + log(1 + e^-1).
expected_exact = 1001 + math.log(1 + math.exp(-1))
print(f'Expected exact: {expected_exact:.4f}')
check_close('stable matches exact at extreme', stable_val, expected_exact, tol=1e-4)
print()
print('Takeaway: Stable LSE is numerically equivalent but avoids overflow.')
print('The custom VJP (softmax) is the key formula for cross-entropy backprop.')
print("Exercise 6 solution complete.")
Exercise 7 (★★★): Gradient Checkpointing Analysis
Analyse the √L-checkpointing scheme for a depth-L sequential network.
(a) Implement a sequential chain x_{k+1} = tanh(x_k + c_k) for k = 0, ..., L−1, tracking peak memory (number of activations stored). (b) Implement the √L-checkpointing scheme: store activations only at layer indices that are multiples of √L. (c) For L in {16, 64, 100, 256, 1024, 4096}, report standard memory, checkpoint memory, and their ratio. (d) Show that checkpoint memory ≤ 2√L + 1. (e) Verify that checkpointed gradients match standard gradients (within floating-point tolerance).
Code cell 23
# Your Solution
# Exercise 7 - learner workspace: model standard vs sqrt(L)-checkpoint peak memory.
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 7.")
Code cell 24
# Solution
# Exercise 7 - reference solution
import math
def peak_memory_standard(L):
    # Plain backprop stores every one of the L layer activations.
    return L # all L activations stored
def peak_memory_checkpoint(L):
    """Peak activation count under the sqrt-L checkpointing scheme.

    One activation is kept per segment boundary, plus the activations of the
    single segment currently being recomputed for the backward pass.
    """
    segment_len = max(1, int(math.sqrt(L)))
    num_segments = math.ceil(L / segment_len)
    return num_segments + segment_len
# (Re-defined here so this cell runs standalone.)
def header(t):
    print('\n' + '=' * len(t) + '\n' + t + '\n' + '=' * len(t))
def check_true(name, cond):
    print(f' {"PASS" if cond else "FAIL"} -- {name}')
    return cond
header('Exercise 7: Gradient Checkpointing')
print(f'{"L":>6} {"Standard":>10} {"Checkpoint":>12} {"Ratio":>8} {"<= 2*sqrt(L)":>14}')
all_pass = True
for L in [16, 64, 100, 256, 1024, 4096]:
    std = peak_memory_standard(L)
    ckpt = peak_memory_checkpoint(L)
    ratio = std/ckpt
    bound = 2*math.sqrt(L)
    ok = ckpt <= bound + 1  # +1 slack absorbs the ceil() rounding
    all_pass = all_pass and ok
    print(f'{L:>6} {std:>10} {ckpt:>12} {ratio:>8.2f} {ckpt:.0f} <= {bound:.1f}: {ok}')
print()
check_true('All checkpoint memories <= 2*sqrt(L)+1', all_pass)
print()
# Part (e): gradient correctness — checkpointing only recomputes forward
# values, so the backward VJPs it feeds are bit-identical by construction.
print('Gradient correctness: checkpointing recomputes the same forward values,')
print('so backward VJPs are identical. This is guaranteed by the algorithm design.')
print()
# Numerical demonstration: recompute the final activation from the last stored
# checkpoint and confirm it matches the fully-stored forward pass.
L_demo = 10
x0 = np.array([1.0])
cs = np.random.randn(L_demo) * 0.1
# Standard: store every activation x_0 .. x_L
acts_std = [x0.copy()]
for k in range(L_demo):
    acts_std.append(np.tanh(acts_std[-1] + cs[k]))
# Checkpoint: keep only segment-start activations (indices 0, K, 2K, ...).
K_demo = int(math.sqrt(L_demo))
ckpt_indices = list(range(0, L_demo, K_demo))
checkpoints = {i: acts_std[i].copy() for i in ckpt_indices}
# BUG FIX: the original compared the LAST CHECKPOINT to the final activation;
# when L is not a multiple of K (here 10 vs K=3) those differ and the demo
# printed Match: False. Real checkpointing recomputes the remaining layers
# from the last checkpoint, which is what we do here.
last_ckpt = max(ckpt_indices)
recomputed = checkpoints[last_ckpt].copy()
for k in range(last_ckpt, L_demo):
    recomputed = np.tanh(recomputed + cs[k])
val_std = acts_std[-1][0]
val_from_ckpt = recomputed[0]
print(f'Final activation (standard): {val_std:.8f}')
print(f'Final activation (checkpoint): {val_from_ckpt:.8f}')
print(f'Match: {abs(val_std - val_from_ckpt) < 1e-12}')
print()
print('Takeaway: Sqrt-L checkpointing reduces memory from O(L) to O(sqrt(L))')
print('at the cost of one extra forward pass per backward. Standard for LLM training.')
print("Exercise 7 solution complete.")
Exercise 8 (★★★): Implicit Differentiation through Ridge Regression
Let z*(θ) = argmin_z (1/2)||Az − b||^2 + (θ/2)||z||^2 (ridge regression).
(a) Implement z_star(theta) = (A^T A + θI)^{-1} A^T b.
(b) Derive and implement the IFT formula: dz*/dθ = −(A^T A + θI)^{-1} z*.
(c) Implement the unrolling approach: differentiate through K gradient descent steps.
(d) Verify all three methods (IFT, FD, unrolling) agree.
(e) Compare computational cost: IFT vs unrolling. When does unrolling fail?
Code cell 26
# Your Solution
# Exercise 8 - learner workspace: compare IFT, FD, and unrolled-solver sensitivities.
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 8.")
Code cell 27
# Solution
# Exercise 8 - reference solution
def z_star(A, b, theta):
    """Closed-form ridge solution: (A^T A + theta*I)^{-1} A^T b."""
    dim = A.shape[1]
    regularised = A.T @ A + theta * np.eye(dim)
    return np.linalg.solve(regularised, A.T @ b)
def dz_dtheta_ift(A, b, theta):
    """Implicit-function-theorem sensitivity: dz*/dtheta = -(A^T A + theta*I)^{-1} z*."""
    dim = A.shape[1]
    regularised = A.T @ A + theta * np.eye(dim)
    solution = z_star(A, b, theta)
    return -np.linalg.solve(regularised, solution)
def dz_dtheta_fd(A, b, theta, h=1e-5):
    # Central finite difference of the closed-form ridge solution w.r.t. theta.
    return (z_star(A, b, theta+h) - z_star(A, b, theta-h)) / (2*h)
def dz_dtheta_unroll(A, b, theta, K=200, lr=0.01):
    """Sensitivity via unrolling: finite-difference two K-step gradient-descent runs."""
    def solve_by_gd(theta_val, K=K, lr=lr):
        # Plain GD on the ridge objective from a zero start.
        z = np.zeros(A.shape[1])
        for _ in range(K):
            step = A.T @ (A @ z - b) + theta_val * z
            z = z - lr * step
        return z
    h = 1e-5
    return (solve_by_gd(theta + h) - solve_by_gd(theta - h)) / (2 * h)
# (Re-defined here so this cell runs standalone.)
def header(t): print('\n'+'='*len(t)+'\n'+t+'\n'+'='*len(t))
def check_close(name, got, expected, tol=1e-4):
    ok = np.allclose(got, expected, atol=tol, rtol=tol)
    print(f' {"PASS" if ok else "FAIL"} -- {name}')
    return ok
# Random ridge instance for the three-way comparison.
np.random.seed(42)
m8, n8 = 10, 5
A8 = np.random.randn(m8, n8)
b8 = np.random.randn(m8)
theta8 = 1.0
dz_ift = dz_dtheta_ift(A8, b8, theta8)
dz_fd = dz_dtheta_fd(A8, b8, theta8)
# Unrolling needs many steps and a small lr to approach the fixed point.
dz_unroll = dz_dtheta_unroll(A8, b8, theta8, K=500, lr=0.005)
header('Exercise 8: Implicit Differentiation')
print(f'IFT: {dz_ift.round(4)}')
print(f'FD: {dz_fd.round(4)}')
print(f'Unroll: {dz_unroll.round(4)}')
print()
check_close('IFT matches FD', dz_ift, dz_fd, tol=1e-4)
check_close('Unroll approx IFT (K=500)', dz_unroll, dz_ift, tol=1e-2)
print()
print('Cost comparison:')
print(f' IFT: 1 linear solve O(n^3) + 1 matrix-vector O(n^2). Total: ~2 solves.')
print(f' Unrolling K=500 steps: 500 gradient evaluations O(mn) each.')
print(f' Ratio: unrolling is ~{500*m8*n8 / (n8**3 + n8**2):.0f}x more operations.')
print()
print('Unrolling fails when:')
print(' - K is too small (gradient descent not converged => biased estimate)')
print(' - Hessian is ill-conditioned (slow convergence => need huge K)')
print(' - Function has no closed-form solution (then IFT needs CG to solve the linear system)')
print()
print('Takeaway: IFT differentiates through ANY inner solver in O(n^3) regardless of solver steps.')
print('This is the foundation of DEQ models, differentiable QP layers, and bilevel optimisation.')
print("Exercise 8 solution complete.")
Exercise 9 (★★★): Hessian-Vector Product from Gradient Differences
For scalar f(x) = Σ_i log(1 + e^{x_i}) + (1/2) x^T A x (with A symmetric), compute the Hessian-vector product Hv by differentiating gradients and compare with a finite-difference estimate.
Code cell 29
# Your Solution
# Exercise 9 - learner workspace
# Derive the exact HVP and compare with finite differences of gradients.
print("Learner workspace ready for Exercise 9.")
Code cell 30
# Solution
# Exercise 9 - HVP by gradient differences
header("Exercise 9: HVP by gradient differences")
# A is symmetric, so 0.5 * x^T A x contributes exactly A to the Hessian.
A = np.array([[2.0, 0.3], [0.3, 1.2]])
x = np.array([0.4, -1.1])
v = np.array([0.7, 0.2])
def grad_f(x):
    # Gradient of f(x) = sum_i log(1 + e^{x_i}) + 0.5 x^T A x:
    # d/dx log(1+e^x) = sigmoid(x).
    return sigmoid(x) + A @ x
def hvp_exact(x, v):
    # Hessian = diag(s*(1-s)) + A, since sigmoid'(x) = s(1-s).
    s = sigmoid(x)
    return s * (1 - s) * v + A @ v
def hvp_fd(x, v, h=1e-6):
    # Directional derivative of the gradient along v (central difference).
    return (grad_f(x + h*v) - grad_f(x - h*v)) / (2*h)
exact = hvp_exact(x, v)
fd = hvp_fd(x, v)
print("exact HVP:", exact)
print("FD HVP:", fd)
check_close("HVP agreement", exact, fd, tol=1e-6)
print("Takeaway: autodiff systems can expose curvature-vector products without materializing Hessians.")
Exercise 10 (★★★): Straight-Through Estimator for a Clipped Quantizer
Consider q(x) = round(clip(x, −1, 1)). Its true derivative is zero almost everywhere, but a straight-through estimator uses derivative 1 inside the clipping range.
Compare both gradients on a toy squared-error loss.
Code cell 32
# Your Solution
# Exercise 10 - learner workspace
# Compare true almost-everywhere gradient with an STE surrogate.
print("Learner workspace ready for Exercise 10.")
Code cell 33
# Solution
# Exercise 10 - straight-through estimator
header("Exercise 10: straight-through estimator")
x = np.array([-1.4, -0.6, 0.2, 0.9, 1.3])
target = np.array([-1.0, -1.0, 0.0, 1.0, 1.0])
def quantize(x):
    # Clip to [-1, 1], then round to the nearest integer level in {-1, 0, 1}.
    return np.round(np.clip(x, -1, 1))
q = quantize(x)
loss = 0.5 * np.sum((q - target)**2)
# True derivative of the piecewise-constant quantizer is 0 almost everywhere.
true_grad = np.zeros_like(x)
# STE surrogate: pass dL/dq straight through wherever x is inside the clip range.
ste_mask = (x >= -1) & (x <= 1)
ste_grad = (q - target) * ste_mask.astype(float)
print("q(x):", q)
print("loss:", loss)
print("true gradient a.e.:", true_grad)
print("STE gradient:", ste_grad)
check_true("STE gives learning signal where true derivative is zero", la.norm(ste_grad) >= la.norm(true_grad))
print("Takeaway: STE is a surrogate-gradient convention, not the classical derivative.")
What to Review After Finishing
- Dual numbers: verify arithmetic rules from Definition 3.1 by hand
- Adjoint accumulation: trace a 3-node graph by hand without looking at the solution
- Topological sort: implement from memory — it's the core of every autograd engine
- JVP vs VJP crossover: n/m ratio determines the efficient mode
- HVP trick: derive from the chain rule
- Checkpointing: can you explain the scheme in one paragraph?
- IFT formula: derive it by differentiating implicitly
References
- Griewank & Walther, Evaluating Derivatives: Principles and Techniques of Algorithmic Differentiation (2008)
- Baydin et al., Automatic Differentiation in Machine Learning: a Survey (JMLR 2018)
- Paszke et al., Automatic differentiation in PyTorch (NeurIPS 2017)
- Finn, Abbeel & Levine, Model-Agnostic Meta-Learning (ICML 2017)
- Chen et al., Training Deep Nets with Sublinear Memory Cost (2016)
- Blondel et al., Efficient and Modular Implicit Differentiation (NeurIPS 2022)