Exercises Notebook
Converted from exercises.ipynb for web reading.
Numerical Optimization — Exercises
Practice is the price of mastery.
Companion: notes.md | theory.ipynb
Ten graded exercises covering line search, Newton's method, L-BFGS, finite differences, trust regions, and modern ML optimizers.
Difficulty: ★ = Straightforward | ★★ = Moderate | ★★★ = Challenging
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import numpy as np
import numpy.linalg as nla
from scipy import stats
import scipy.linalg as la

try:
    import matplotlib.pyplot as plt
    import matplotlib as mpl
    try:
        import seaborn as sns
        sns.set_theme(style='whitegrid', palette='colorblind')
        HAS_SNS = True
    except ImportError:
        plt.style.use('seaborn-v0_8-whitegrid')
        HAS_SNS = False
    mpl.rcParams.update({
        'figure.figsize': (10, 6), 'figure.dpi': 120,
        'font.size': 13, 'axes.titlesize': 15, 'axes.labelsize': 13,
        'xtick.labelsize': 11, 'ytick.labelsize': 11,
        'legend.fontsize': 11, 'lines.linewidth': 2.0,
        'axes.spines.top': False, 'axes.spines.right': False,
    })
    HAS_MPL = True
except ImportError:
    HAS_MPL = False

COLORS = {
    'primary': '#0077BB',
    'secondary': '#EE7733',
    'tertiary': '#009988',
    'error': '#CC3311',
    'neutral': '#555555',
}
np.set_printoptions(precision=8, suppress=True)
np.random.seed(42)
print('Setup complete.')
print("Chapter helper setup complete.")
def header(title):
    print("\n" + "=" * len(title))
    print(title)
    print("=" * len(title))

def check_true(name, cond):
    print(f"{'PASS' if cond else 'FAIL'} - {name}")
    return bool(cond)

def check_close(name, got, expected, tol=1e-8):
    ok = np.allclose(got, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    if not ok:
        print('  expected:', expected)
        print('  got     :', got)
    return bool(ok)
print("Chapter helper setup complete.")
Exercise 1 ★ — Backtracking Armijo Line Search
The Armijo (sufficient decrease) condition requires that the step length $\alpha$ satisfy

$$f(x + \alpha p) \le f(x) + c_1 \alpha \, \nabla f(x)^\top p.$$

Task: Implement armijo_backtrack(f, grad_f, x, p, alpha0=1.0, rho=0.5, c1=1e-4) that repeatedly multiplies $\alpha$ by $\rho$ (halving it, by default) until the Armijo condition is satisfied.
Test function: $f(x) = x_1^2 + 10 x_2^2$ (an elongated bowl).
(a) Starting from $x_0 = (3, 1)$ with steepest-descent direction $p = -\nabla f(x_0)$, run backtracking and print each trial $\alpha$ and $f$-value.
(b) Verify that the returned $\alpha$ satisfies the Armijo condition.
(c) Run gradient descent with your line search for 100 iterations. Confirm the objective decreases monotonically.
Code cell 5
# Your Solution
# Exercise 1 — Scaffold
def f1(x):
    return x[0]**2 + 10*x[1]**2

def grad_f1(x):
    return np.array([2*x[0], 20*x[1]])

def armijo_backtrack(f, grad_f, x, p, alpha0=1.0, rho=0.5, c1=1e-4):
    """
    Backtracking Armijo line search.
    Returns: alpha satisfying the sufficient decrease condition.
    """

print("Learner scaffold loaded; write your solution here, then compare below.")
Code cell 6
# Solution
def armijo_backtrack_sol(f, grad_f, x, p, alpha0=1.0, rho=0.5, c1=1e-4, verbose=False):
    alpha = alpha0
    f0 = f(x)
    slope = grad_f(x) @ p
    assert slope < 0, 'p must be a descent direction'
    for _ in range(100):
        if verbose:  # print each trial alpha and f-value, as part (a) asks
            print(f'  trial alpha={alpha:.6f}  f={f(x + alpha*p):.6f}')
        if f(x + alpha * p) <= f0 + c1 * alpha * slope:
            break
        alpha *= rho
    return alpha

# (a)
x0 = np.array([3.0, 1.0])
p = -grad_f1(x0)
alpha = armijo_backtrack_sol(f1, grad_f1, x0, p, alpha0=1.0, rho=0.5, verbose=True)
print(f'Chosen alpha: {alpha:.6f}')
print(f'f(x0)     = {f1(x0):.6f}')
print(f'f(x0+a*p) = {f1(x0 + alpha*p):.6f}')

# (b)
c1 = 1e-4
lhs = f1(x0 + alpha * p)
rhs = f1(x0) + c1 * alpha * (grad_f1(x0) @ p)
ok = lhs <= rhs
print(f'\nArmijo: {lhs:.8f} <= {rhs:.8f} -> {"PASS" if ok else "FAIL"}')

# (c) Gradient descent
x = x0.copy()
fvals = [f1(x)]
for _ in range(100):
    g = grad_f1(x)
    p = -g
    a = armijo_backtrack_sol(f1, grad_f1, x, p)
    x = x + a * p
    fvals.append(f1(x))

monotone = all(fvals[i+1] <= fvals[i] for i in range(len(fvals)-1))
print(f'\nFinal f: {fvals[-1]:.2e}')
print(f'Monotone decrease: {"PASS" if monotone else "FAIL"}')
Exercise 2 ★ — Newton's Method with Backtracking Line Search
Newton's method uses the direction $p = -H^{-1} \nabla f(x)$, where $H = \nabla^2 f(x)$. It achieves quadratic convergence near a strongly convex minimum.
Test function: Rosenbrock $f(x) = (1 - x_1)^2 + 100\,(x_2 - x_1^2)^2$.
(a) Implement newton_step(grad, hess) that returns $p = -H^{-1} g$ using np.linalg.solve. If the Hessian is indefinite (Cholesky fails), fall back to steepest descent.
(b) Run 50 Newton steps from $x_0 = (-1, 1)$ with a simple backtracking line search (Armijo, $\rho = 0.5$, $c_1 = 10^{-4}$). Print the iteration, $f$-value, and $\|\nabla f\|$ every 10 steps.
(c) Verify that once $\|\nabla f\| < 10^{-3}$, subsequent iterates converge quadratically (print the log-ratio of successive gradient norms).
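Why the log-ratio should approach 2 (a brief gloss added here): quadratic convergence means $\|g_{k+1}\| \le C\,\|g_k\|^2$, so $\log\|g_{k+1}\| \approx 2\log\|g_k\| + \log C$; once the norms are small, the constant term is negligible and $\log\|g_{k+1}\| / \log\|g_k\| \to 2$.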
Code cell 8
# Your Solution
# Exercise 2 — Scaffold
def rosenbrock(x):
    return (1 - x[0])**2 + 100*(x[1] - x[0]**2)**2

def rosenbrock_grad(x):
    g0 = -2*(1 - x[0]) - 400*x[0]*(x[1] - x[0]**2)
    g1 = 200*(x[1] - x[0]**2)
    return np.array([g0, g1])

def rosenbrock_hess(x):
    h00 = 2 - 400*(x[1] - x[0]**2) + 800*x[0]**2
    h01 = -400*x[0]
    h11 = 200.0
    return np.array([[h00, h01], [h01, h11]])

def newton_step(grad, hess):
    """
    Returns Newton direction -H^{-1}g.
    Falls back to -g if H is indefinite.
    """

print("Learner scaffold loaded; write your solution here, then compare below.")
Code cell 9
# Solution
def newton_step_sol(g, H):
    try:
        la.cholesky(H)  # Check positive definiteness
        return -la.solve(H, g)
    except la.LinAlgError:
        return -g  # Fall back to steepest descent

def armijo_sol(f, gf, x, p, rho=0.5, c1=1e-4):
    alpha, f0, slope = 1.0, f(x), gf(x) @ p
    if slope >= 0:
        return 1e-8
    for _ in range(60):
        if f(x + alpha*p) <= f0 + c1*alpha*slope:
            break
        alpha *= rho
    return alpha

# (b)
x = np.array([-1.0, 1.0])
grad_norms = []
for k in range(50):
    g = rosenbrock_grad(x)
    H = rosenbrock_hess(x)
    p = newton_step_sol(g, H)
    a = armijo_sol(rosenbrock, rosenbrock_grad, x, p)
    x = x + a * p
    grad_norms.append(np.linalg.norm(rosenbrock_grad(x)))
    if k % 10 == 0:
        print(f'  k={k:2d}  f={rosenbrock(x):.4e}  ||g||={grad_norms[-1]:.4e}')

print(f'\nFinal x: {x}')
print(f'||g||  : {grad_norms[-1]:.2e}')

# (c) Quadratic convergence check
near_min = [gn for gn in grad_norms if gn < 1e-3]
print('\nLog-ratio of gradient norms (should approach 2 for quadratic convergence):')
for i in range(1, min(5, len(near_min))):
    if near_min[i-1] > 0 and near_min[i] > 0:
        ratio = np.log(near_min[i]) / np.log(near_min[i-1])
        print(f'  log(||g_{i}||)/log(||g_{i-1}||) = {ratio:.4f}')

ok = grad_norms[-1] < 1e-6
print('\nPASS: ||g|| < 1e-6' if ok else 'WARN: did not fully converge')
Exercise 3 ★★ — L-BFGS Two-Loop Recursion
The L-BFGS direction is computed without storing the full Hessian approximation, using the two-loop recursion. Given the last $m$ curvature pairs $(s_i, y_i)$ with $s_i = x_{i+1} - x_i$ and $y_i = \nabla f_{i+1} - \nabla f_i$:
(a) Implement lbfgs_direction(g, s_list, y_list) returning $H_k^{-1} g$ via the two-loop recursion, with initial scaling $H_k^0 = \gamma_k I$ where $\gamma_k = \frac{s_{k-1}^\top y_{k-1}}{y_{k-1}^\top y_{k-1}}$.
(b) Verify correctness: when $m = 1$ with a single pair $(s_0, y_0)$, the direction should equal the rank-one BFGS inverse update applied to $g$.
(c) Run L-BFGS with $m = 5$ on the Rosenbrock function from $x_0 = (-1.5, 0.5)$. Compare the iteration count and final $f$-value against gradient descent with the same line search.
Code cell 11
# Your Solution
# Exercise 3 — Scaffold
def lbfgs_direction(g, s_list, y_list):
    """
    L-BFGS two-loop recursion.
    g      : current gradient (1-D array)
    s_list : list of s_i = x_{i+1} - x_i, most recent last
    y_list : list of y_i = g_{i+1} - g_i, most recent last
    Returns: H_k^{-1} g (descent direction is its negative)
    """
    m = len(s_list)
    if m == 0:
        return g.copy()
    q = g.copy()
    alphas = []
    # First loop: iterate from most recent to oldest
    for i in range(m - 1, -1, -1):
        s, y = s_list[i], y_list[i]

print("Learner scaffold loaded; write your solution here, then compare below.")
Code cell 12
# Solution
def lbfgs_direction_sol(g, s_list, y_list):
    m = len(s_list)
    if m == 0:
        return g.copy()
    q = g.copy()
    alphas = []
    rhos = []
    for i in range(m - 1, -1, -1):
        s, y = s_list[i], y_list[i]
        ys = y @ s
        if ys <= 0:
            alphas.insert(0, 0.0)
            rhos.insert(0, 0.0)
            continue
        rho_i = 1.0 / ys
        rhos.insert(0, rho_i)
        a = rho_i * (s @ q)
        alphas.insert(0, a)
        q = q - a * y
    # Scaling
    s_last, y_last = s_list[-1], y_list[-1]
    gamma = (y_last @ s_last) / (y_last @ y_last + 1e-16)
    r = gamma * q
    for i in range(m):
        s, y = s_list[i], y_list[i]
        if rhos[i] == 0:
            continue
        beta = rhos[i] * (y @ r)
        r = r + s * (alphas[i] - beta)
    return r

# (b) Single-pair check
np.random.seed(7)
g_test = np.random.randn(4)
s0 = np.random.randn(4) * 0.1
y0 = np.random.randn(4) * 0.1
# Ensure y@s > 0
y0 = y0 - (y0@s0)/(s0@s0)*s0 + 0.1*s0
d = lbfgs_direction_sol(g_test, [s0], [y0])
print(f'L-BFGS direction (m=1): {d.round(6)}')
print(f'Curvature check (d @ g > 0): {"PASS" if d @ g_test > 0 else "FAIL"} (direction is H^-1 g, not -H^-1 g)')
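# Extra check (added; not part of the original solution): with m = 1 and initial
# scaling gamma*I, the two-loop result should equal the explicit inverse-BFGS
# update H1 = (I - rho s y^T) (gamma I) (I - rho y s^T) + rho s s^T applied to g.
rho0 = 1.0 / (y0 @ s0)
gamma0 = (y0 @ s0) / (y0 @ y0)
I4 = np.eye(4)
H1 = (I4 - rho0 * np.outer(s0, y0)) @ (gamma0 * I4) @ (I4 - rho0 * np.outer(y0, s0)) + rho0 * np.outer(s0, s0)
print(f'Matches explicit BFGS inverse update: {"PASS" if np.allclose(d, H1 @ g_test) else "FAIL"}')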
# (c) L-BFGS on Rosenbrock
def run_lbfgs(x0, m=5, max_iter=500, tol=1e-10):
    x = x0.copy()
    s_list, y_list = [], []
    fvals = [rosenbrock(x)]
    for k in range(max_iter):
        g = rosenbrock_grad(x)
        if np.linalg.norm(g) < tol:
            break
        d = lbfgs_direction_sol(g, s_list, y_list)
        p = -d
        alpha = 1.0
        f0, slope = rosenbrock(x), rosenbrock_grad(x) @ p
        if slope >= 0:
            p = -g
            slope = rosenbrock_grad(x) @ p
        for _ in range(60):
            if rosenbrock(x + alpha*p) <= f0 + 1e-4*alpha*slope:
                break
            alpha *= 0.5
        x_new = x + alpha * p
        g_new = rosenbrock_grad(x_new)
        s = x_new - x
        y = g_new - g
        if y @ s > 1e-10:
            s_list.append(s)
            y_list.append(y)
            if len(s_list) > m:
                s_list.pop(0)
                y_list.pop(0)
        x = x_new
        fvals.append(rosenbrock(x))
    return x, fvals, k+1

x0 = np.array([-1.5, 0.5])
x_lbfgs, fvals_lbfgs, iters_lbfgs = run_lbfgs(x0, m=5)
print(f'\nL-BFGS: {iters_lbfgs} iters, f={fvals_lbfgs[-1]:.4e}, x={x_lbfgs.round(6)}')

# Gradient descent for comparison
x_gd = x0.copy()
fvals_gd = [rosenbrock(x_gd)]
for k in range(10000):
    g = rosenbrock_grad(x_gd)
    if np.linalg.norm(g) < 1e-10:
        break
    p = -g
    f0, sl = rosenbrock(x_gd), g @ p
    a = 1.0
    for _ in range(60):
        if rosenbrock(x_gd + a*p) <= f0 + 1e-4*a*sl:
            break
        a *= 0.5
    x_gd = x_gd + a*p
    fvals_gd.append(rosenbrock(x_gd))

print(f'GD: {k+1} iters, f={fvals_gd[-1]:.4e}')
print(f'\nL-BFGS speedup: ~{(k+1)/iters_lbfgs:.0f}x fewer iterations')
Exercise 4 ★★ — Finite Difference Gradient Check
Gradient checking is the practical safeguard when analytical gradients might contain bugs. The centered difference approximation is

$$\frac{\partial f}{\partial x_i} \approx \frac{f(x + h e_i) - f(x - h e_i)}{2h}.$$

The optimal step size balances truncation error $O(h^2)$ against rounding error $O(\varepsilon / h)$, giving $h^* \sim \varepsilon^{1/3} \approx 6 \times 10^{-6}$ in double precision.
(a) Implement finite_diff_grad(f, x, h) using centered differences.
(b) For the function $f(x) = \sin(x_1) + e^{x_2} x_3^2$, compare the finite-difference gradient to the analytical gradient for step sizes $h \in [10^{-15}, 10^{-1}]$ (log-spaced). Plot the relative error vs $h$ and identify the optimal step size.
(c) Implement the complex-step method, $\partial f / \partial x_i \approx \operatorname{Im}[f(x + i h\, e_i)] / h$, using f evaluated at complex inputs. Verify it achieves accuracy independent of $h$ (for $h$ not too large).
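Why the complex step avoids cancellation (a short gloss added for clarity): Taylor-expanding along the imaginary axis, $f(x + ih) = f(x) + ih f'(x) - \tfrac{h^2}{2} f''(x) - i\tfrac{h^3}{6} f'''(x) + \cdots$, so $\operatorname{Im}[f(x+ih)]/h = f'(x) - \tfrac{h^2}{6} f'''(x) + \cdots$. No subtraction of nearly equal numbers occurs, so $h$ can be made tiny (even $10^{-150}$) without rounding-error blowup.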
Code cell 14
# Your Solution
# Exercise 4 — Scaffold
def f4(x):
    """f(x) = sin(x1) + exp(x2)*x3^2"""
    return np.sin(x[0]) + np.exp(x[1]) * x[2]**2

def grad_f4_analytic(x):
    """Analytical gradient of f4."""
    return np.array([
        np.cos(x[0]),
        np.exp(x[1]) * x[2]**2,
        2 * np.exp(x[1]) * x[2],
    ])

def finite_diff_grad(f, x, h=1e-5):
    """Centered finite difference gradient. Returns array of shape x.shape."""

print("Learner scaffold loaded; write your solution here, then compare below.")
Code cell 15
# Solution
def finite_diff_grad_sol(f, x, h=1e-5):
    grad = np.zeros_like(x, dtype=float)
    for i in range(len(x)):
        xp, xm = x.copy(), x.copy()
        xp[i] += h
        xm[i] -= h
        grad[i] = (f(xp) - f(xm)) / (2 * h)
    return grad

def complex_step_grad_sol(f, x, h=1e-150):
    grad = np.zeros(len(x))
    for i in range(len(x)):
        xc = x.astype(complex)
        xc[i] += 1j * h
        grad[i] = np.imag(f(xc)) / h
    return grad

x0 = np.array([0.5, -0.3, 1.2])
g_true = grad_f4_analytic(x0)
h_vals = np.logspace(-15, -1, 60)
errors = []
for h in h_vals:
    g_fd = finite_diff_grad_sol(f4, x0, h)
    errors.append(np.linalg.norm(g_fd - g_true) / np.linalg.norm(g_true))

optimal_h = h_vals[np.argmin(errors)]
print(f'Optimal h: {optimal_h:.2e} (expected ~6e-6)')
print(f'Min relative error: {min(errors):.2e}')

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.loglog(h_vals, errors, color=COLORS['primary'], linewidth=2)
    ax.axvline(optimal_h, color=COLORS['error'], linestyle='--', label=f'Optimal h={optimal_h:.1e}')
    ax.set_title('Finite-difference gradient error vs step size')
    ax.set_xlabel('Step size h')
    ax.set_ylabel('Relative error')
    ax.legend()
    plt.show()

# Complex step
g_cs = complex_step_grad_sol(f4, x0, h=1e-150)
cs_err = np.linalg.norm(g_cs - g_true) / np.linalg.norm(g_true)
print(f'\nComplex-step relative error: {cs_err:.2e} (should be ~machine eps)')
ok = cs_err < 1e-10
print('PASS: complex-step achieves near-machine precision' if ok else 'FAIL: error too large')
Exercise 5 ★★ — Trust Region vs Line Search
Trust region methods choose the step size first (trust radius $\Delta$) and find the best step within the ball $\|p\| \le \Delta$. The ratio of actual to predicted reduction,

$$\rho = \frac{f(x) - f(x + p)}{m(0) - m(p)}, \qquad m(p) = f(x) + g^\top p + \tfrac{1}{2} p^\top B p,$$

determines whether to accept the step and how to update $\Delta$.
(a) Implement cauchy_point(g, B, delta) that computes the Cauchy point:

$$p^C = -\tau \frac{\Delta}{\|g\|}\, g, \qquad \tau = \begin{cases} 1 & g^\top B g \le 0 \\ \min\!\left(\dfrac{\|g\|^3}{\Delta\, g^\top B g},\, 1\right) & \text{otherwise.} \end{cases}$$

(b) Implement trust_region_step(f, grad_f, hess_f, x, delta) that:
- Computes the Cauchy point as the step
- Evaluates $\rho$ and accepts/rejects the step
- Updates $\Delta$: contract if $\rho < 0.25$, expand if $\rho > 0.75$ and the step hit the boundary
(c) Run 50 trust-region steps on $f(x) = \tfrac{1}{2} x^\top A x - b^\top x$ with $A$ = tridiagonal$(-1, 2, -1)$, $b = \mathbf{1}$, $n = 10$. Compare convergence to gradient descent with backtracking.
Code cell 17
# Your Solution
# Exercise 5 — Scaffold
def cauchy_point(g, B, delta):
    """
    Cauchy point for trust region subproblem.
    g: gradient, B: Hessian (or approx), delta: trust radius.
    Returns: Cauchy point p^C (within trust region).
    """

print("Learner scaffold loaded; write your solution here, then compare below.")
Code cell 18
# Solution
def cauchy_point_sol(g, B, delta):
    Bg = B @ g
    gBg = g @ Bg
    gnorm = np.linalg.norm(g)
    if gnorm < 1e-15:
        return np.zeros_like(g)
    tau = 1.0
    if gBg > 0:
        tau = min(1.0, gnorm**3 / (delta * gBg))
    p = -tau * (delta / gnorm) * g
    return p

def trust_region_step_sol(f, grad_f, hess_f, x, delta,
                          eta1=0.1, eta2=0.75, t1=0.5, t2=2.0):
    g = grad_f(x)
    B = hess_f(x)
    p = cauchy_point_sol(g, B, delta)
    actual = f(x) - f(x + p)
    predicted = -(g @ p + 0.5 * p @ B @ p)
    if predicted < 1e-15:
        return x, delta * t1, False
    rho = actual / predicted
    x_new = x + p if rho > eta1 else x
    if rho < 0.25:
        delta_new = t1 * delta
    elif rho > eta2 and abs(np.linalg.norm(p) - delta) < 1e-10:
        delta_new = min(t2 * delta, 1e8)
    else:
        delta_new = delta
    return x_new, delta_new, rho > eta1

# (c) Quadratic problem
n = 10
diags = [2*np.ones(n), -np.ones(n-1), -np.ones(n-1)]
A_quad = np.diag(diags[0]) + np.diag(diags[1], 1) + np.diag(diags[2], -1)
b_quad = np.ones(n)
f_quad = lambda x: 0.5 * x @ A_quad @ x - b_quad @ x
gf_quad = lambda x: A_quad @ x - b_quad
hf_quad = lambda x: A_quad
x0 = np.zeros(n)

# Trust region
x_tr, delta, fvals_tr = x0.copy(), 1.0, [f_quad(x0)]
for _ in range(50):
    x_tr, delta, _ = trust_region_step_sol(f_quad, gf_quad, hf_quad, x_tr, delta)
    fvals_tr.append(f_quad(x_tr))

# Gradient descent
x_gd2, fvals_gd2 = x0.copy(), [f_quad(x0)]
for _ in range(50):
    g = gf_quad(x_gd2)
    p = -g
    a, f0, sl = 1.0, f_quad(x_gd2), g @ p
    for _ in range(40):
        if f_quad(x_gd2 + a*p) <= f0 + 1e-4*a*sl:
            break
        a *= 0.5
    x_gd2 += a * p
    fvals_gd2.append(f_quad(x_gd2))

x_exact = la.solve(A_quad, b_quad)
print(f'Trust region final f: {fvals_tr[-1]:.4e}')
print(f'GD final f: {fvals_gd2[-1]:.4e}')
print(f'Exact minimum: {f_quad(x_exact):.4e}')
ok = abs(fvals_tr[-1] - f_quad(x_exact)) < 1e-6
print('PASS: trust region converges to optimum' if ok else 'WARN: check convergence')
Exercise 6 ★★ — Adam as Diagonal Quasi-Newton
Adam maintains first and second moment estimates of the gradient:

$$m_t = \beta_1 m_{t-1} + (1 - \beta_1)\, g_t, \qquad v_t = \beta_2 v_{t-1} + (1 - \beta_2)\, g_t^2,$$
$$\hat m_t = \frac{m_t}{1 - \beta_1^t}, \qquad \hat v_t = \frac{v_t}{1 - \beta_2^t}, \qquad x_{t+1} = x_t - \alpha\, \frac{\hat m_t}{\sqrt{\hat v_t} + \epsilon}.$$

The effective step approximates a diagonal inverse-Hessian preconditioner, with $\sqrt{\hat v_t}$ acting as a per-coordinate curvature estimate.
(a) Implement adam(grad_f, x0, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8, T=1000).
(b) Minimize $f(x) = \sum_i c_i x_i^2$ where the $c_i$ are log-uniformly spaced in $[1, 10^3]$ (a highly ill-conditioned diagonal quadratic, $d = 20$). Compare Adam, SGD-with-momentum, and diagonal Newton ($x_i \leftarrow x_i - g_i / (2 c_i)$) after 2000 steps.
(c) Compare Adam's effective per-coordinate learning rate $\alpha / (\sqrt{\hat v_i} + \epsilon)$ to the diagonal Newton step $1 / (2 c_i)$. Print the ratio effective_lr / newton_lr for each coordinate at convergence.
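A rough way to see the analogy (an added gloss, not a precise claim): on this separable quadratic $g_i = 2 c_i x_i$, so $\sqrt{\hat v_i}$ tracks $c_i |x_i|$ and the effective step $\alpha / \sqrt{\hat v_i}$ shrinks in the stiff (large-$c_i$) coordinates, qualitatively like the Newton scaling $1/(2 c_i)$; the correspondence is only approximate because $\hat v_i$ also depends on the trajectory of $|x_i|$.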
Code cell 20
# Your Solution
# Exercise 6 — Scaffold
def adam(grad_f, x0, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8, T=1000):
    """
    Adam optimizer.
    grad_f: function returning gradient
    x0: initial point
    Returns: (final x, loss history)
    """
    x = x0.copy().astype(float)

print("Learner scaffold loaded; write your solution here, then compare below.")
Code cell 21
# Solution
def adam_sol(grad_f, x0, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8, T=1000):
    x = x0.copy().astype(float)
    m = np.zeros_like(x)
    v = np.zeros_like(x)
    fvals = []
    for t in range(1, T + 1):
        g = grad_f(x)
        m = beta1 * m + (1 - beta1) * g
        v = beta2 * v + (1 - beta2) * g**2
        m_hat = m / (1 - beta1**t)
        v_hat = v / (1 - beta2**t)
        x = x - lr * m_hat / (np.sqrt(v_hat) + eps)
        fvals.append(np.sum(c * x**2))  # note: uses the global coefficients c defined below
    return x, fvals

d = 20
c = np.logspace(0, 3, d)
f6 = lambda x: np.sum(c * x**2)
gf6 = lambda x: 2 * c * x
x0 = np.ones(d)
x_adam, fvals_adam = adam_sol(gf6, x0, lr=1e-2, T=2000)

# SGD + momentum
x_sgd = x0.copy()
v_sgd = np.zeros(d)
fvals_sgd = []
for t in range(2000):
    g = gf6(x_sgd)
    v_sgd = 0.9 * v_sgd - 0.001 * g
    x_sgd += v_sgd
    fvals_sgd.append(f6(x_sgd))

# Diagonal Newton (exact for this quadratic)
x_dn = x0.copy()
fvals_dn = []
for t in range(2000):
    g = gf6(x_dn)
    x_dn = x_dn - g / (2 * c)  # Exact Newton step for f = sum(c * x^2)
    fvals_dn.append(f6(x_dn))

print('After 2000 steps:')
print(f'  Adam:            f = {fvals_adam[-1]:.4e}')
print(f'  SGD+momentum:    f = {fvals_sgd[-1]:.4e}')
print(f'  Diagonal Newton: f = {fvals_dn[-1]:.4e}')

# (c) Effective learning rate analysis
# Re-run Adam and inspect v_hat at convergence
m2 = np.zeros(d)
v2 = np.zeros(d)
x2 = x0.copy()
beta1, beta2, eps = 0.9, 0.999, 1e-8
for t in range(1, 5001):
    g = gf6(x2)
    m2 = beta1*m2 + (1-beta1)*g
    v2 = beta2*v2 + (1-beta2)*g**2
    m_hat = m2/(1-beta1**t)
    v_hat = v2/(1-beta2**t)
    x2 = x2 - 1e-2 * m_hat / (np.sqrt(v_hat) + eps)

effective_lr = 1e-2 / (np.sqrt(v_hat) + eps)
newton_lr = 1.0 / (2*c)
print('\nEffective LR / Newton LR (first 5 coordinates):')
print((effective_lr / newton_lr)[:5].round(4))
Exercise 7 ★★ — Mixed-Precision Gradient Accumulation
Modern LLM training uses mixed precision: forward and backward passes in bf16/fp16, but gradient accumulation and parameter updates in fp32. Without loss scaling, small fp16 gradients underflow to zero.
(a) Simulate fp16 gradient underflow: take a true gradient of magnitude $10^{-8}$, below the smallest fp16 subnormal ($2^{-24} \approx 6 \times 10^{-8}$). Show that np.float16(grad) rounds to zero but np.float32(grad) does not.
(b) Implement loss scaling: multiply the loss by $S = 2^{15}$, compute gradients, then divide by $S$ before the weight update. Verify the unscaled gradient matches fp32.
(c) Implement a simplified dynamic loss scaler that:
- Doubles $S$ every 2000 steps if no overflow has occurred
- Halves $S$ and skips the update when inf or nan appears in the scaled gradients

Simulate 100 steps where a random 5% of steps would overflow. Print the scale trajectory.
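A minimal sketch of where the scaler sits in a training step (an orientation aid, not part of the exercise; backward_fp16 is a hypothetical stand-in for the half-precision backward pass, and scaler is assumed to expose the step(grads_ok) interface implemented in the solution below):

def train_step(w_fp32, backward_fp16, scaler, lr=1e-3):
    g_scaled = backward_fp16(w_fp32, loss_scale=scaler.scale)  # fp16 grads of S * loss
    grads_ok = bool(np.all(np.isfinite(np.float32(g_scaled))))
    if scaler.step(grads_ok):  # grow/backoff bookkeeping; False means skip the update
        g = np.float32(g_scaled) / scaler.scale  # unscale in fp32
        w_fp32 = w_fp32 - lr * g                 # fp32 master-weight update
    return w_fp32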
Code cell 23
# Your Solution
# Exercise 7 — Scaffold
# (a) fp16 underflow
grad_true = 1e-8  # True gradient magnitude (below the smallest fp16 subnormal)
print("Learner scaffold loaded; write your solution here, then compare below.")
Code cell 24
# Solution
# (a) fp16 underflow demonstration
grad_true = 1e-8  # below the smallest fp16 subnormal, 2**-24 ~ 6e-8
g_fp16 = np.float16(grad_true)
g_fp32 = np.float32(grad_true)
print(f'True gradient: {grad_true}')
print(f'fp16 gradient: {g_fp16} (underflows to {g_fp16:.6f})')
print(f'fp32 gradient: {g_fp32}')
print(f'fp16 == 0: {g_fp16 == 0}')
print(f'fp32 == 0: {g_fp32 == 0}')

# (b) Loss scaling
S = np.float32(2**15)
scaled_loss = np.float32(grad_true) * S
scaled_g_fp16 = np.float16(scaled_loss)  # Now representable in fp16
unscaled = np.float32(scaled_g_fp16) / S
print(f'\nWith scale {S:.0f}:')
print(f'  Scaled grad (fp16): {scaled_g_fp16}')
print(f'  Unscaled: {unscaled}')
print(f'  Matches fp32: {abs(unscaled - g_fp32) / g_fp32 < 1e-2}')

# (c) Dynamic loss scaler
class DynamicLossScalerSol:
    def __init__(self, init_scale=2**15, growth_interval=2000, growth_factor=2.0, backoff_factor=0.5):
        self.scale = float(init_scale)
        self.growth_interval = growth_interval
        self.growth_factor = growth_factor
        self.backoff_factor = backoff_factor
        self.steps_since_overflow = 0

    def step(self, grads_ok):
        """grads_ok: True if no inf/nan in scaled gradients."""
        if not grads_ok:
            self.scale *= self.backoff_factor
            self.steps_since_overflow = 0
            return False  # Skip update
        self.steps_since_overflow += 1
        if self.steps_since_overflow >= self.growth_interval:
            self.scale *= self.growth_factor
            self.steps_since_overflow = 0
        return True

np.random.seed(0)
scaler = DynamicLossScalerSol(init_scale=2**10, growth_interval=20)
scales = [scaler.scale]
skipped = 0
for step in range(100):
    overflow = np.random.random() < 0.05  # 5% overflow rate
    ok = scaler.step(not overflow)
    if not ok:
        skipped += 1
    scales.append(scaler.scale)

print('\nDynamic scaler over 100 steps:')
print(f'  Skipped updates: {skipped}')
print(f'  Final scale: {scales[-1]:.0f}')
print(f'  Scale trajectory (every 20 steps): {[f"{s:.0f}" for s in scales[::20]]}')
Exercise 8 ★★★ — Hessian-Vector Products and Curvature Analysis
Computing the full Hessian costs $O(n^2)$ memory. A Hessian-vector product (HVP) needs only $O(n)$ memory and one extra gradient evaluation, via $\nabla f(x + \epsilon v) = \nabla f(x) + \epsilon H(x) v + O(\epsilon^2)$:

$$H(x)\, v \approx \frac{\nabla f(x + \epsilon v) - \nabla f(x)}{\epsilon}$$

(a) Implement hvp_finite_diff(grad_f, x, v, eps=1e-5) using the forward difference approximation of the HVP.
(b) Verify correctness: for $f(x) = \tfrac{1}{2} x^\top A x - b^\top x$, $Hv = Av$. Test with a random PD matrix $A$.
(c) Use the power method with HVPs to estimate the largest eigenvalue of the Hessian at the Rosenbrock minimum $x^* = (1, 1)$. Compare to np.linalg.eigvalsh(rosenbrock_hess([1,1])).
(d) Implement a few steps of Lanczos iteration using HVPs to estimate both $\lambda_{\max}$ and $\lambda_{\min}$ without forming $H$ explicitly. Report the condition number $\kappa = \lambda_{\max} / \lambda_{\min}$.
Code cell 26
# Your Solution
# Exercise 8 — Scaffold
def hvp_finite_diff(grad_f, x, v, eps=1e-5):
    """
    Hessian-vector product via finite differences.
    Returns H(x) @ v.
    """

print("Learner scaffold loaded; write your solution here, then compare below.")
Code cell 27
# Solution
def hvp_finite_diff_sol(grad_f, x, v, eps=1e-5):
    return (grad_f(x + eps * v) - grad_f(x)) / eps

# (b) Verification
np.random.seed(42)
n = 50
M = np.random.randn(n, n)
A_pd = M.T @ M / n + np.eye(n) * 0.1  # Positive definite
b_pd = np.random.randn(n)
f_quad_nd = lambda x: 0.5 * x @ A_pd @ x - b_pd @ x
gf_quad_nd = lambda x: A_pd @ x - b_pd
x_test = np.random.randn(n)
v_test = np.random.randn(n)
hvp_fd = hvp_finite_diff_sol(gf_quad_nd, x_test, v_test)
hvp_exact = A_pd @ v_test
err = np.linalg.norm(hvp_fd - hvp_exact) / np.linalg.norm(hvp_exact)
print(f'HVP relative error: {err:.2e}')
ok = err < 1e-4
print('PASS: HVP finite diff accurate' if ok else f'FAIL: error = {err:.2e}')

# (c) Power method at Rosenbrock minimum
x_min = np.array([1.0, 1.0])
v = np.random.randn(2)
v /= np.linalg.norm(v)
lambda_est = 0.0
for _ in range(100):
    Hv = hvp_finite_diff_sol(rosenbrock_grad, x_min, v)
    lambda_est = np.linalg.norm(Hv)
    v = Hv / lambda_est

H_exact = rosenbrock_hess(x_min)
evals_exact = np.sort(np.linalg.eigvalsh(H_exact))[::-1]
print(f'\nPower method lambda_max: {lambda_est:.6f}')
print(f'Exact lambda_max:        {evals_exact[0]:.6f}')
print('PASS' if abs(lambda_est - evals_exact[0])/evals_exact[0] < 1e-4 else 'FAIL')

# (d) Lanczos iteration
def lanczos_hvp_sol(grad_f, x, k=10):
    n = len(x)
    q = np.random.randn(n)
    q /= np.linalg.norm(q)
    Q = [q]
    alphas, betas = [], []
    for j in range(k):
        z = hvp_finite_diff_sol(grad_f, x, Q[-1])
        a = Q[-1] @ z
        alphas.append(a)
        if j == 0:
            r = z - a * Q[-1]
        else:
            r = z - a * Q[-1] - betas[-1] * Q[-2]
        b = np.linalg.norm(r)
        betas.append(b)
        if b < 1e-12:
            break
        Q.append(r / b)
    # Tridiagonal T: alphas on the diagonal, betas on the off-diagonals
    T = np.diag(alphas) + np.diag(betas[:-1], 1) + np.diag(betas[:-1], -1)
    evals = np.linalg.eigvalsh(T)
    return evals.max(), evals.min()

lam_max, lam_min = lanczos_hvp_sol(rosenbrock_grad, x_min, k=2)
kappa_est = lam_max / max(abs(lam_min), 1e-10)
kappa_exact = evals_exact[0] / evals_exact[-1]
print(f'\nLanczos: lambda_max={lam_max:.4f}, lambda_min={lam_min:.4f}')
print(f'Estimated condition number: {kappa_est:.2f}')
print(f'Exact condition number:     {kappa_exact:.2f}')
Exercise 9 ★★★ — Gradient Clipping as a Trust Region
Compare raw gradient descent with norm-clipped updates on a steep quadratic: with a step size too large for the stiff direction, the raw iteration diverges while the clipped one stays stable.
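One way to state the connection (an added gloss): the clipped update $\Delta\theta = -\eta g \cdot \min(1,\, c / \|\eta g\|)$ minimizes the linear model $g^\top p$ subject to $\|p\| \le \min(c, \|\eta g\|)$, i.e. it is a steepest-descent step confined to a ball of radius $c$, the same structure as the trust-region subproblem of Exercise 5 with $B = 0$.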
Code cell 29
# Your Solution
print("Write your solution here, then compare with the reference solution below.")
Code cell 30
# Solution
H = np.diag([100., 1.])
theta = np.array([5., 5.])
eta = 0.05

def loss(t):
    return 0.5 * t @ (H @ t)

def step(t, clip=None):
    g = H @ t
    upd = -eta * g
    if clip is not None and np.linalg.norm(upd) > clip:
        upd = upd / np.linalg.norm(upd) * clip
    return t + upd

raw, clipped = theta.copy(), theta.copy()
for _ in range(20):
    raw = step(raw)
    clipped = step(clipped, clip=0.5)

header('Exercise 9: Gradient Clipping')
print(f'raw loss={loss(raw):.3e}, clipped loss={loss(clipped):.3e}')
check_true('clipped path remains finite and improves', np.isfinite(loss(clipped)) and loss(clipped) < loss(theta))
print('Takeaway: clipping behaves like a simple trust-region constraint on update size.')
Exercise 10 ★★★ — Hessian-Vector Products by Finite Differences
Approximate $Hv$ without forming $H$, using the gradient finite difference $Hv \approx \left(\nabla f(x + \epsilon v) - \nabla f(x)\right) / \epsilon$ (the same identity as Exercise 8).
Code cell 32
# Your Solution
print("Write your solution here, then compare with the reference solution below.")
Code cell 33
# Solution
Q = np.array([[5., 1.], [1., 2.]])
b = np.array([1., -3.])

def grad(x):
    return Q @ x + b

x = np.array([0.7, -1.2])
v = np.array([2., -0.5])
eps = 1e-5
hvp_fd = (grad(x + eps * v) - grad(x)) / eps
hvp_true = Q @ v

header('Exercise 10: Hessian-Vector Products')
print('finite diff:', hvp_fd, 'true:', hvp_true)
check_true('HVP finite difference is accurate', np.allclose(hvp_fd, hvp_true, atol=1e-7))
print('Takeaway: Hessian-vector products enable second-order methods without materializing dense Hessians.')