Math for LLMs — Numerical Methods / Numerical Optimization

Converted from exercises.ipynb for web reading.

Numerical Optimization — Exercises

Practice is the price of mastery.

Companion: notes.md | theory.ipynb

Ten graded exercises covering line search, Newton's method, L-BFGS, finite differences, trust regions, Hessian-vector products, and modern ML optimizers.

Difficulty: ★ = Straightforward | ★★ = Moderate | ★★★ = Challenging

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

import numpy as np
import numpy.linalg as nla
from scipy import stats
import scipy.linalg as la

try:
    import matplotlib.pyplot as plt
    import matplotlib as mpl
    try:
        import seaborn as sns
        sns.set_theme(style='whitegrid', palette='colorblind')
        HAS_SNS = True
    except ImportError:
        plt.style.use('seaborn-v0_8-whitegrid')
        HAS_SNS = False
    mpl.rcParams.update({
        'figure.figsize': (10, 6), 'figure.dpi': 120,
        'font.size': 13, 'axes.titlesize': 15, 'axes.labelsize': 13,
        'xtick.labelsize': 11, 'ytick.labelsize': 11,
        'legend.fontsize': 11, 'lines.linewidth': 2.0,
        'axes.spines.top': False, 'axes.spines.right': False,
    })
    HAS_MPL = True
except ImportError:
    HAS_MPL = False

COLORS = {
    'primary':   '#0077BB',
    'secondary': '#EE7733',
    'tertiary':  '#009988',
    'error':     '#CC3311',
    'neutral':   '#555555',
}

np.set_printoptions(precision=8, suppress=True)
np.random.seed(42)
print('Setup complete.')

print("Chapter helper setup complete.")

def header(title):
    print("\n" + "=" * len(title))
    print(title)
    print("=" * len(title))

def check_true(name, cond):
    print(f"{'PASS' if cond else 'FAIL'} - {name}")
    return bool(cond)

def check_close(name, got, expected, tol=1e-8):
    ok = np.allclose(got, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    if not ok:
        print('  expected:', expected)
        print('  got     :', got)
    return bool(ok)

print("Chapter helper setup complete.")

Exercise 1 ★ — Backtracking Armijo Line Search

The Armijo (sufficient decrease) condition requires that the step $\alpha$ satisfies:

$$f(x_k + \alpha p_k) \leq f(x_k) + c_1 \alpha \nabla f(x_k)^\top p_k$$

Task: Implement armijo_backtrack(f, grad_f, x, p, alpha0=1.0, rho=0.5, c1=1e-4) that repeatedly halves $\alpha$ until the Armijo condition is satisfied.

Test function: $f(x) = x_1^2 + 10 x_2^2$ (an elongated bowl).

(a) Starting from $x_0 = (3, 1)$ with the steepest-descent direction $p = -\nabla f(x_0)$, run backtracking and print each trial $\alpha$ and $f$-value.

(b) Verify that the returned $\alpha$ satisfies the Armijo condition.

(c) Run gradient descent with your line search for 100 iterations. Confirm the objective decreases monotonically.

Code cell 5

# Your Solution
# Exercise 1 — Scaffold

def f1(x):
    return x[0]**2 + 10*x[1]**2

def grad_f1(x):
    return np.array([2*x[0], 20*x[1]])

def armijo_backtrack(f, grad_f, x, p, alpha0=1.0, rho=0.5, c1=1e-4):
    """
    Backtracking Armijo line search.
    Returns: alpha satisfying the sufficient decrease condition.
    """

print("Learner scaffold loaded; write your solution here, then compare below.")

Code cell 6

# Solution
def armijo_backtrack_sol(f, grad_f, x, p, alpha0=1.0, rho=0.5, c1=1e-4):
    alpha = alpha0
    f0 = f(x)
    slope = grad_f(x) @ p
    assert slope < 0, 'p must be a descent direction'
    for _ in range(100):
        if f(x + alpha * p) <= f0 + c1 * alpha * slope:
            break
        alpha *= rho
    return alpha

# (a)
x0 = np.array([3.0, 1.0])
p = -grad_f1(x0)
alpha = armijo_backtrack_sol(f1, grad_f1, x0, p, alpha0=1.0, rho=0.5)
print(f'Chosen alpha: {alpha:.6f}')
print(f'f(x0)       = {f1(x0):.6f}')
print(f'f(x0+a*p)   = {f1(x0 + alpha*p):.6f}')

# (b)
c1 = 1e-4
lhs = f1(x0 + alpha * p)
rhs = f1(x0) + c1 * alpha * (grad_f1(x0) @ p)
ok = lhs <= rhs
print(f'\nArmijo: {lhs:.8f} <= {rhs:.8f} -> {"PASS" if ok else "FAIL"}')

# (c) Gradient descent
x = x0.copy()
fvals = [f1(x)]
for _ in range(100):
    g = grad_f1(x)
    p = -g
    a = armijo_backtrack_sol(f1, grad_f1, x, p)
    x = x + a * p
    fvals.append(f1(x))

monotone = all(fvals[i+1] <= fvals[i] for i in range(len(fvals)-1))
print(f'\nFinal f: {fvals[-1]:.2e}')
print(f'Monotone decrease: {"PASS" if monotone else "FAIL"}')
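As an optional cross-check, assuming SciPy is available, scipy.optimize.line_search performs a Wolfe line search; its step should also satisfy the Armijo condition for this problem:

from scipy.optimize import line_search

x0 = np.array([3.0, 1.0])
p = -grad_f1(x0)
alpha_w = line_search(f1, grad_f1, x0, p)[0]
if alpha_w is not None:
    armijo_ok = f1(x0 + alpha_w * p) <= f1(x0) + 1e-4 * alpha_w * (grad_f1(x0) @ p)
    print(f'SciPy Wolfe step: alpha={alpha_w:.6f}, Armijo satisfied: {armijo_ok}')
else:
    print('SciPy line search did not return a step.')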

Exercise 2 ★ — Newton's Method with Backtracking Line Search

Newton's method uses the direction $p_k = -H_k^{-1} g_k$ where $H_k = \nabla^2 f(x_k)$. It achieves quadratic convergence near a strongly convex minimum.

Test function: Rosenbrock $f(x) = (1 - x_1)^2 + 100 (x_2 - x_1^2)^2$.

(a) Implement newton_step(grad, hess) that returns $-H^{-1} g$ using np.linalg.solve. If the Hessian is indefinite (Cholesky fails), fall back to steepest descent.

(b) Run 50 Newton steps from $x_0 = (-1, 1)$ with a simple backtracking line search (Armijo, $\rho = 0.5$, $c_1 = 10^{-4}$). Print the iteration, $f$-value, and $\|\nabla f\|$ every 10 steps.

(c) Verify that once $\|\nabla f\| < 10^{-3}$, subsequent iterates converge quadratically (print the log-ratio of successive gradient norms).

Code cell 8

# Your Solution
# Exercise 2 — Scaffold

def rosenbrock(x):
    return (1 - x[0])**2 + 100*(x[1] - x[0]**2)**2

def rosenbrock_grad(x):
    g0 = -2*(1 - x[0]) - 400*x[0]*(x[1] - x[0]**2)
    g1 = 200*(x[1] - x[0]**2)
    return np.array([g0, g1])

def rosenbrock_hess(x):
    h00 = 2 - 400*(x[1] - x[0]**2) + 800*x[0]**2
    h01 = -400*x[0]
    h11 = 200.0
    return np.array([[h00, h01], [h01, h11]])

def newton_step(grad, hess):
    """
    Returns Newton direction -H^{-1}g.
    Falls back to -g if H is indefinite.
    """

print("Learner scaffold loaded; write your solution here, then compare below.")

Code cell 9

# Solution
def newton_step_sol(g, H):
    try:
        la.cholesky(H)  # Check positive definiteness
        return -la.solve(H, g)
    except la.LinAlgError:
        return -g  # Fall back to steepest descent

def armijo_sol(f, gf, x, p, rho=0.5, c1=1e-4):
    alpha, f0, slope = 1.0, f(x), gf(x) @ p
    if slope >= 0:
        return 1e-8
    for _ in range(60):
        if f(x + alpha*p) <= f0 + c1*alpha*slope:
            break
        alpha *= rho
    return alpha

# (b)
x = np.array([-1.0, 1.0])
grad_norms = []
for k in range(50):
    g = rosenbrock_grad(x)
    H = rosenbrock_hess(x)
    p = newton_step_sol(g, H)
    a = armijo_sol(rosenbrock, rosenbrock_grad, x, p)
    x = x + a * p
    grad_norms.append(np.linalg.norm(rosenbrock_grad(x)))
    if k % 10 == 0:
        print(f'  k={k:2d}  f={rosenbrock(x):.4e}  ||g||={grad_norms[-1]:.4e}')

print(f'\nFinal x: {x}')
print(f'||g||  : {grad_norms[-1]:.2e}')

# (c) Quadratic convergence check
near_min = [gn for gn in grad_norms if gn < 1e-3]
print('\nLog-ratio of gradient norms (should approach 2 for quadratic convergence):')
for i in range(1, min(5, len(near_min))):
    if near_min[i-1] > 0 and near_min[i] > 0:
        ratio = np.log(near_min[i]) / np.log(near_min[i-1])
        print(f'  log(||g_{i}||)/log(||g_{i-1}||) = {ratio:.4f}')
ok = grad_norms[-1] < 1e-6
print(f'\nPASS: ||g|| < 1e-6' if ok else 'WARN: did not fully converge')
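As an optional cross-check, assuming SciPy is available, scipy.optimize.minimize with the Newton-CG method uses the same gradient and Hessian and should land on the minimizer (1, 1):

from scipy.optimize import minimize

res = minimize(rosenbrock, np.array([-1.0, 1.0]), jac=rosenbrock_grad,
               hess=rosenbrock_hess, method='Newton-CG')
print(f'SciPy Newton-CG: x={res.x.round(6)}, f={res.fun:.2e}, iterations={res.nit}')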

Exercise 3 ★★ — L-BFGS Two-Loop Recursion

The L-BFGS direction is computed without storing the full Hessian approximation, using the two-loop recursion. Given the last $m$ curvature pairs $(s_i, y_i)$ with $s_i = x_{i+1} - x_i$ and $y_i = g_{i+1} - g_i$, the first loop runs from the most recent pair to the oldest:

$$\rho_i = \frac{1}{y_i^\top s_i}, \qquad q \leftarrow g_k, \qquad \alpha_i \leftarrow \rho_i s_i^\top q, \qquad q \leftarrow q - \alpha_i y_i$$

then $r \leftarrow H_0 q$, and a second loop (oldest pair first) applies $\beta_i = \rho_i y_i^\top r$, $r \leftarrow r + s_i (\alpha_i - \beta_i)$.

(a) Implement lbfgs_direction(g, s_list, y_list) returning $H_k^{-1} g$ via the two-loop recursion with the scaling $H_0 = \frac{y_{k-1}^\top s_{k-1}}{y_{k-1}^\top y_{k-1}} I$.

(b) Verify correctness: when $m = 1$ with a single pair $(s_0, y_0)$, the direction should equal the single-pair BFGS inverse-Hessian update applied to $-g$.

(c) Run L-BFGS with $m = 5$ on the Rosenbrock function from $x_0 = (-1.5, 0.5)$. Compare the iteration count and final $f$-value against gradient descent with the same line search.

Code cell 11

# Your Solution
# Exercise 3 — Scaffold

def lbfgs_direction(g, s_list, y_list):
    """
    L-BFGS two-loop recursion.
    g      : current gradient (1-D array)
    s_list : list of s_i = x_{i+1} - x_i, most recent last
    y_list : list of y_i = g_{i+1} - g_i, most recent last
    Returns: H_k^{-1} g (descent direction is its negative)
    """
    m = len(s_list)
    if m == 0:
        return g.copy()

    q = g.copy()
    alphas = []

    # First loop: iterate from most recent to oldest
    for i in range(m - 1, -1, -1):
        s, y = s_list[i], y_list[i]

print("Learner scaffold loaded; write your solution here, then compare below.")

Code cell 12

# Solution
def lbfgs_direction_sol(g, s_list, y_list):
    m = len(s_list)
    if m == 0:
        return g.copy()
    q = g.copy()
    alphas = []
    rhos = []
    for i in range(m - 1, -1, -1):
        s, y = s_list[i], y_list[i]
        ys = y @ s
        if ys <= 0:
            alphas.insert(0, 0.0)
            rhos.insert(0, 0.0)
            continue
        rho_i = 1.0 / ys
        rhos.insert(0, rho_i)
        a = rho_i * (s @ q)
        alphas.insert(0, a)
        q = q - a * y
    # Scaling
    s_last, y_last = s_list[-1], y_list[-1]
    gamma = (y_last @ s_last) / (y_last @ y_last + 1e-16)
    r = gamma * q
    for i in range(m):
        s, y = s_list[i], y_list[i]
        if rhos[i] == 0:
            continue
        beta = rhos[i] * (y @ r)
        r = r + s * (alphas[i] - beta)
    return r

# (b) Single-pair check
np.random.seed(7)
g_test = np.random.randn(4)
s0 = np.random.randn(4) * 0.1
y0 = np.random.randn(4) * 0.1
# Ensure y@s > 0
y0 = y0 - (y0@s0)/(s0@s0)*s0 + 0.1*s0  
d = lbfgs_direction_sol(g_test, [s0], [y0])
print(f'L-BFGS direction (m=1): {d.round(6)}')
print(f'Curvature check (d @ g > 0, since d = H^-1 g rather than -H^-1 g): {"PASS" if d @ g_test > 0 else "FAIL"}')
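# Completing part (b): with a single stored pair, the two-loop result should equal
# the explicit single-pair BFGS inverse-Hessian update (built from H0 = gamma * I)
# applied to g.
rho0 = 1.0 / (y0 @ s0)
gamma0 = (y0 @ s0) / (y0 @ y0)
I4 = np.eye(4)
Hinv_bfgs = ((I4 - rho0 * np.outer(s0, y0)) @ (gamma0 * I4) @ (I4 - rho0 * np.outer(y0, s0))
             + rho0 * np.outer(s0, s0))
print(f'Matches explicit BFGS update: {np.allclose(d, Hinv_bfgs @ g_test)}')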

# (c) L-BFGS on Rosenbrock
def run_lbfgs(x0, m=5, max_iter=500, tol=1e-10):
    x = x0.copy()
    s_list, y_list = [], []
    fvals = [rosenbrock(x)]
    for k in range(max_iter):
        g = rosenbrock_grad(x)
        if np.linalg.norm(g) < tol:
            break
        d = lbfgs_direction_sol(g, s_list, y_list)
        p = -d
        alpha = 1.0
        f0, slope = rosenbrock(x), rosenbrock_grad(x) @ p
        if slope >= 0:
            p = -g; slope = rosenbrock_grad(x) @ p
        for _ in range(60):
            if rosenbrock(x + alpha*p) <= f0 + 1e-4*alpha*slope:
                break
            alpha *= 0.5
        x_new = x + alpha * p
        g_new = rosenbrock_grad(x_new)
        s = x_new - x
        y = g_new - g
        if y @ s > 1e-10:
            s_list.append(s)
            y_list.append(y)
            if len(s_list) > m:
                s_list.pop(0)
                y_list.pop(0)
        x = x_new
        fvals.append(rosenbrock(x))
    return x, fvals, k+1

x0 = np.array([-1.5, 0.5])
x_lbfgs, fvals_lbfgs, iters_lbfgs = run_lbfgs(x0, m=5)
print(f'\nL-BFGS: {iters_lbfgs} iters, f={fvals_lbfgs[-1]:.4e}, x={x_lbfgs.round(6)}')

# Gradient descent for comparison
x_gd = x0.copy()
fvals_gd = [rosenbrock(x_gd)]
for k in range(10000):
    g = rosenbrock_grad(x_gd)
    if np.linalg.norm(g) < 1e-10:
        break
    p = -g
    f0, sl = rosenbrock(x_gd), g @ p
    a = 1.0
    for _ in range(60):
        if rosenbrock(x_gd+a*p) <= f0+1e-4*a*sl: break
        a *= 0.5
    x_gd = x_gd + a*p
    fvals_gd.append(rosenbrock(x_gd))
print(f'GD:     {k+1} iters, f={fvals_gd[-1]:.4e}')
print(f'\nL-BFGS speedup: ~{(k+1)/iters_lbfgs:.0f}x fewer iterations')
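As an optional cross-check, assuming SciPy is available, SciPy's L-BFGS-B implementation should reach a comparable result:

from scipy.optimize import minimize

res = minimize(rosenbrock, np.array([-1.5, 0.5]), jac=rosenbrock_grad, method='L-BFGS-B')
print(f'SciPy L-BFGS-B: {res.nit} iters, f={res.fun:.4e}, x={res.x.round(6)}')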

Exercise 4 ★★ — Finite Difference Gradient Check

Gradient checking is the practical workaround when analytical gradients might contain bugs. The centered difference approximation is:

$$\frac{f(x + h e_i) - f(x - h e_i)}{2h} = \frac{\partial f}{\partial x_i} + O(h^2)$$

The optimal step size balances the $O(h^2)$ truncation error against the $O(\varepsilon_{\text{mach}} / h)$ rounding error:

$$h^* \approx \varepsilon_{\text{mach}}^{1/3} \approx 6 \times 10^{-6}$$
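A quick numeric check of that rule of thumb (double-precision $\varepsilon_{\text{mach}} \approx 2.2 \times 10^{-16}$):

print(f"eps_mach^(1/3) = {np.finfo(float).eps ** (1/3):.2e}")  # ~6.06e-06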

(a) Implement finite_diff_grad(f, x, h) using centered differences.

(b) For the function $f(x) = \sin(x_1) + e^{x_2} x_3^2$, compare the finite-difference gradient to the analytical gradient for step sizes $h \in [10^{-15}, 10^{-1}]$ (log-spaced). Plot the relative error vs $h$ and identify the optimal step size.

(c) Implement the complex-step method: $\frac{\partial f}{\partial x_i} \approx \operatorname{Im}[f(x + ih\, e_i)] / h$, evaluating f at complex inputs. Verify it achieves $\sim 10^{-15}$ accuracy independent of $h$ (for $h$ not too large).

Code cell 14

# Your Solution
# Exercise 4 — Scaffold

def f4(x):
    """f(x) = sin(x1) + exp(x2)*x3^2"""
    return np.sin(x[0]) + np.exp(x[1]) * x[2]**2

def grad_f4_analytic(x):
    """Analytical gradient of f4."""
    return np.array([
        np.cos(x[0]),
        np.exp(x[1]) * x[2]**2,
        2 * np.exp(x[1]) * x[2],
    ])

def finite_diff_grad(f, x, h=1e-5):
    """Centered finite difference gradient. Returns array of shape x.shape."""

print("Learner scaffold loaded; write your solution here, then compare below.")

Code cell 15

# Solution
def finite_diff_grad_sol(f, x, h=1e-5):
    grad = np.zeros_like(x, dtype=float)
    for i in range(len(x)):
        xp, xm = x.copy(), x.copy()
        xp[i] += h; xm[i] -= h
        grad[i] = (f(xp) - f(xm)) / (2 * h)
    return grad

def complex_step_grad_sol(f, x, h=1e-150):
    grad = np.zeros(len(x))
    for i in range(len(x)):
        xc = x.astype(complex).copy()
        xc[i] += 1j * h
        grad[i] = np.imag(f(xc)) / h
    return grad

x0 = np.array([0.5, -0.3, 1.2])
g_true = grad_f4_analytic(x0)
h_vals = np.logspace(-15, -1, 60)
errors = []
for h in h_vals:
    g_fd = finite_diff_grad_sol(f4, x0, h)
    errors.append(np.linalg.norm(g_fd - g_true) / np.linalg.norm(g_true))

optimal_h = h_vals[np.argmin(errors)]
print(f'Optimal h: {optimal_h:.2e} (expected ~6e-6)')
print(f'Min relative error: {min(errors):.2e}')

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.loglog(h_vals, errors, color=COLORS['primary'], linewidth=2)
    ax.axvline(optimal_h, color=COLORS['error'], linestyle='--', label=f'Optimal h={optimal_h:.1e}')
    ax.set_title('Finite-difference gradient error vs step size')
    ax.set_xlabel('Step size h')
    ax.set_ylabel('Relative error')
    ax.legend(); plt.show()

# Complex step
g_cs = complex_step_grad_sol(f4, x0, h=1e-150)
cs_err = np.linalg.norm(g_cs - g_true) / np.linalg.norm(g_true)
print(f'\nComplex-step relative error: {cs_err:.2e} (should be ~machine eps)')
ok = cs_err < 1e-10
print(f'PASS: complex-step achieves near-machine precision' if ok else 'FAIL: error too large')
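As an optional cross-check, assuming SciPy is available, scipy.optimize.check_grad compares the analytical gradient against a forward-difference approximation at the same point:

from scipy.optimize import check_grad

err_scipy = check_grad(f4, grad_f4_analytic, np.array([0.5, -0.3, 1.2]))
print(f'scipy.optimize.check_grad error: {err_scipy:.2e}')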

Exercise 5 ★★ — Trust Region vs Line Search

Trust region methods choose the step size first (the trust radius $\Delta_k$) and find the best direction within the ball $\|p\| \leq \Delta_k$. The ratio $\rho_k = \text{actual reduction} / \text{predicted reduction}$ determines whether to accept the step and how to update $\Delta_k$.

(a) Implement cauchy_point(g, B, delta) that computes the Cauchy point:

$$p^C = -\frac{\|g\|^2}{g^\top B g}\, g, \quad \text{clipped to } \|p^C\| = \Delta_k \text{ if necessary}$$

(b) Implement trust_region_step(f, grad_f, hess_f, x, delta) that:

  1. Computes the Cauchy point as the step direction
  2. Evaluates $\rho_k$ and accepts/rejects the step
  3. Updates $\Delta_k$: expand if $\rho > 0.75$, contract if $\rho < 0.25$

(c) Run 50 trust-region steps on $f(x) = \tfrac{1}{2} x^\top A x - b^\top x$ with $A$ the tridiagonal matrix with $2$ on the diagonal and $-1$ on both off-diagonals, and $b = \mathbf{1}$. Compare convergence to gradient descent with backtracking.

Code cell 17

# Your Solution
# Exercise 5 — Scaffold

def cauchy_point(g, B, delta):
    """
    Cauchy point for trust region subproblem.
    g: gradient, B: Hessian (or approx), delta: trust radius.
    Returns: Cauchy point p^C (within trust region).
    """

print("Learner scaffold loaded; write your solution here, then compare below.")

Code cell 18

# Solution
def cauchy_point_sol(g, B, delta):
    Bg = B @ g
    gBg = g @ Bg
    gnorm = np.linalg.norm(g)
    if gnorm < 1e-15:
        return np.zeros_like(g)
    tau = 1.0
    if gBg > 0:
        tau = min(1.0, gnorm**3 / (delta * gBg))
    p = -tau * (delta / gnorm) * g
    return p

def trust_region_step_sol(f, grad_f, hess_f, x, delta,
                           eta1=0.1, eta2=0.75, t1=0.5, t2=2.0):
    g = grad_f(x)
    B = hess_f(x)
    p = cauchy_point_sol(g, B, delta)
    actual = f(x) - f(x + p)
    predicted = -(g @ p + 0.5 * p @ B @ p)
    if predicted < 1e-15:
        return x, delta * t1, False
    rho = actual / predicted
    x_new = x + p if rho > eta1 else x
    if rho < 0.25:
        delta_new = t1 * delta
    elif rho > eta2 and abs(np.linalg.norm(p) - delta) < 1e-10:
        delta_new = min(t2 * delta, 1e8)
    else:
        delta_new = delta
    return x_new, delta_new, rho > eta1

# (c) Quadratic problem
n = 10
diags = [2*np.ones(n), -np.ones(n-1), -np.ones(n-1)]
A_quad = np.diag(diags[0]) + np.diag(diags[1], 1) + np.diag(diags[2], -1)
b_quad = np.ones(n)
f_quad = lambda x: 0.5 * x @ A_quad @ x - b_quad @ x
gf_quad = lambda x: A_quad @ x - b_quad
hf_quad = lambda x: A_quad

x0 = np.zeros(n)
# Trust region
x_tr, delta, fvals_tr = x0.copy(), 1.0, [f_quad(x0)]
for _ in range(50):
    x_tr, delta, _ = trust_region_step_sol(f_quad, gf_quad, hf_quad, x_tr, delta)
    fvals_tr.append(f_quad(x_tr))
# Gradient descent
x_gd2, fvals_gd2 = x0.copy(), [f_quad(x0)]
for _ in range(50):
    g = gf_quad(x_gd2)
    p = -g
    a, f0, sl = 1.0, f_quad(x_gd2), g @ p
    for _ in range(40):
        if f_quad(x_gd2+a*p) <= f0+1e-4*a*sl: break
        a *= 0.5
    x_gd2 += a * p
    fvals_gd2.append(f_quad(x_gd2))

x_exact = la.solve(A_quad, b_quad)
print(f'Trust region final f:  {fvals_tr[-1]:.4e}')
print(f'GD final f:            {fvals_gd2[-1]:.4e}')
print(f'Exact minimum:         {f_quad(x_exact):.4e}')
ok = abs(fvals_tr[-1] - f_quad(x_exact)) < 1e-6
print(f'PASS: trust region converges to optimum' if ok else 'WARN: check convergence')
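As an optional cross-check, assuming SciPy >= 1.0 is available, a trust-region solver from scipy.optimize should reach the same minimum of this quadratic:

from scipy.optimize import minimize

res = minimize(f_quad, np.zeros(n), jac=gf_quad, hess=hf_quad, method='trust-exact')
print(f'SciPy trust-exact: f={res.fun:.6e} in {res.nit} iterations (exact {f_quad(x_exact):.6e})')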

Exercise 6 ★★ — Adam as Diagonal Quasi-Newton

Adam maintains first and second moment estimates of the gradient:

$$m_t = \beta_1 m_{t-1} + (1-\beta_1) g_t, \qquad v_t = \beta_2 v_{t-1} + (1-\beta_2) g_t^2$$

$$\hat{m}_t = \frac{m_t}{1-\beta_1^t}, \qquad \hat{v}_t = \frac{v_t}{1-\beta_2^t}$$

$$\theta_{t+1} = \theta_t - \frac{\eta}{\sqrt{\hat{v}_t} + \varepsilon}\, \hat{m}_t$$

The effective step $\eta / (\sqrt{\hat{v}_t} + \varepsilon)$ approximates a diagonal inverse Hessian, i.e. $H \approx \mathrm{diag}(\sqrt{\hat{v}_t})$.

(a) Implement adam(grad_f, x0, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8, T=1000).

(b) Minimize $f(x) = \sum_i c_i x_i^2$ where the $c_i$ are log-uniformly spaced in $[1, 1000]$ (a highly ill-conditioned diagonal quadratic). Compare Adam, SGD with momentum, and diagonal Newton ($\theta_{t+1} = \theta_t - g_t / (2c_i)$ per coordinate) after 500 steps.

(c) Compare Adam's effective per-coordinate learning rate $\eta / (\sqrt{\hat{v}_t} + \varepsilon)$ to the exact Newton step $1 / (2 c_i)$. Print the ratio effective_lr / newton_lr for each coordinate at convergence.
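For reference, the diagonal Newton baseline in (b) is exact for this objective because each coordinate is an independent one-dimensional quadratic:

$$\frac{\partial f}{\partial x_i} = 2 c_i x_i, \qquad \frac{\partial^2 f}{\partial x_i^2} = 2 c_i, \qquad x_i - \frac{2 c_i x_i}{2 c_i} = 0,$$

so it reaches the minimizer in a single step.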

Code cell 20

# Your Solution
# Exercise 6 — Scaffold

def adam(grad_f, x0, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8, T=1000):
    """
    Adam optimizer.
    grad_f: function returning gradient
    x0: initial point
    Returns: (final x, loss history)
    """
    x = x0.copy().astype(float)

print("Learner scaffold loaded; write your solution here, then compare below.")

Code cell 21

# Solution
def adam_sol(grad_f, x0, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8, T=1000):
    x = x0.copy().astype(float)
    m = np.zeros_like(x)
    v = np.zeros_like(x)
    fvals = []
    for t in range(1, T + 1):
        g = grad_f(x)
        m = beta1 * m + (1 - beta1) * g
        v = beta2 * v + (1 - beta2) * g**2
        m_hat = m / (1 - beta1**t)
        v_hat = v / (1 - beta2**t)
        x = x - lr * m_hat / (np.sqrt(v_hat) + eps)
        fvals.append(np.sum(c * x**2))  # objective for the test problem; relies on the global c defined below
    return x, fvals

d = 20
c = np.logspace(0, 3, d)
f6 = lambda x: np.sum(c * x**2)
gf6 = lambda x: 2 * c * x
x0 = np.ones(d)

x_adam, fvals_adam = adam_sol(gf6, x0, lr=1e-2, T=2000)

# SGD + momentum
x_sgd = x0.copy()
v_sgd = np.zeros(d)
fvals_sgd = []
for t in range(2000):
    g = gf6(x_sgd)
    v_sgd = 0.9 * v_sgd - 0.001 * g
    x_sgd += v_sgd
    fvals_sgd.append(f6(x_sgd))

# Diagonal Newton (exact for this quadratic)
x_dn = x0.copy()
fvals_dn = []
for t in range(2000):
    g = gf6(x_dn)
    x_dn = x_dn - g / (2 * c)  # Exact Newton step for f=c*x^2
    fvals_dn.append(f6(x_dn))

print('After 2000 steps:')
print(f'  Adam:            f = {fvals_adam[-1]:.4e}')
print(f'  SGD+momentum:    f = {fvals_sgd[-1]:.4e}')
print(f'  Diagonal Newton: f = {fvals_dn[-1]:.4e}')

# (c) Effective learning rate analysis
# Re-run Adam and inspect v_hat at convergence
m2 = np.zeros(d); v2 = np.zeros(d)
x2 = x0.copy()
beta1, beta2, eps = 0.9, 0.999, 1e-8
for t in range(1, 5001):
    g = gf6(x2)
    m2 = beta1*m2 + (1-beta1)*g
    v2 = beta2*v2 + (1-beta2)*g**2
    m_hat = m2/(1-beta1**t)
    v_hat = v2/(1-beta2**t)
    x2 = x2 - 1e-2 * m_hat / (np.sqrt(v_hat) + eps)
effective_lr = 1e-2 / (np.sqrt(v_hat) + eps)
newton_lr = 1.0 / (2*c)
print('\nEffective LR / Newton LR (first 5 coordinates):')
print((effective_lr / newton_lr)[:5].round(4))
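# Summary over all coordinates (not just the first five): how the effective
# per-coordinate step compares to the exact Newton step 1/(2*c_i).
ratio = effective_lr / newton_lr
print(f'ratio min / median / max: {ratio.min():.3f} / {np.median(ratio):.3f} / {ratio.max():.3f}')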

Exercise 7 ★★ — Mixed-Precision Gradient Accumulation

Modern LLM training uses mixed precision: forward and backward passes in bf16/fp16, but gradient accumulation and parameter updates in fp32. Without loss scaling, fp16 gradients underflow to zero.
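For reference, the relevant fp16 limits can be read off np.finfo (the smallest positive subnormal is $2^{-24}$); a quick look, reusing the NumPy import from the setup cells:

info16 = np.finfo(np.float16)
print('fp16 smallest normal   :', info16.tiny)             # ~6.1e-5
print('fp16 smallest subnormal:', np.float16(2.0 ** -24))  # ~6.0e-8
print('fp16 machine epsilon   :', info16.eps)              # ~9.8e-4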

(a) Simulate fp16 gradient underflow: compute gradients for a simple network where the true gradient has magnitude $10^{-8}$, well below fp16's smallest positive subnormal ($\approx 6 \times 10^{-8}$). Show that np.float16(grad) rounds to zero but np.float32(grad) does not.

(b) Implement loss scaling: multiply the loss by $S = 2^{15}$, compute gradients, then divide by $S$ before the weight update. Verify the unscaled gradient matches fp32.

(c) Implement a simplified dynamic loss scaler that:

  • Doubles $S$ every 2000 steps if no overflow occurred
  • Halves $S$ and skips the update when inf or nan appears in the scaled gradients

Simulate 100 steps where a random 5% of steps would overflow. Print the scale trajectory.

Code cell 23

# Your Solution
# Exercise 7 — Scaffold

# (a) fp16 underflow
grad_true = 1e-8  # True gradient magnitude (below fp16's smallest subnormal)

print("Learner scaffold loaded; write your solution here, then compare below.")

Code cell 24

# Solution
# (a) fp16 underflow demonstration
grad_true = 1e-8   # well below fp16's smallest positive subnormal (~6e-8)
g_fp16 = np.float16(grad_true)
g_fp32 = np.float32(grad_true)
print(f'True gradient:   {grad_true}')
print(f'fp16 gradient:   {g_fp16}  (underflows to {g_fp16:.6f})')
print(f'fp32 gradient:   {g_fp32}')
print(f'fp16 == 0: {g_fp16 == 0}')
print(f'fp32 == 0: {g_fp32 == 0}')

# (b) Loss scaling
S = np.float32(2**15)
scaled_loss = np.float32(grad_true) * S    # ~3.3e-4: comfortably inside fp16's normal range
scaled_g_fp16 = np.float16(scaled_loss)    # Now representable in fp16
unscaled = np.float32(scaled_g_fp16) / S
print(f'\nWith scale {S:.0f}:')
print(f'  Scaled grad (fp16): {scaled_g_fp16}')
print(f'  Unscaled:           {unscaled}')
print(f'  Matches fp32:       {np.isclose(unscaled, g_fp32, rtol=1e-3)}')

# (c) Dynamic loss scaler
class DynamicLossScalerSol:
    def __init__(self, init_scale=2**15, growth_interval=2000, growth_factor=2.0, backoff_factor=0.5):
        self.scale = float(init_scale)
        self.growth_interval = growth_interval
        self.growth_factor = growth_factor
        self.backoff_factor = backoff_factor
        self.steps_since_overflow = 0

    def step(self, grads_ok):
        """grads_ok: True if no inf/nan in scaled gradients."""
        if not grads_ok:
            self.scale *= self.backoff_factor
            self.steps_since_overflow = 0
            return False  # Skip update
        self.steps_since_overflow += 1
        if self.steps_since_overflow >= self.growth_interval:
            self.scale *= self.growth_factor
            self.steps_since_overflow = 0
        return True

np.random.seed(0)
scaler = DynamicLossScalerSol(init_scale=2**10, growth_interval=20)
scales = [scaler.scale]
skipped = 0
for step in range(100):
    overflow = np.random.random() < 0.05  # 5% overflow rate
    ok = scaler.step(not overflow)
    if not ok:
        skipped += 1
    scales.append(scaler.scale)

print(f'\nDynamic scaler over 100 steps:')
print(f'  Skipped updates: {skipped}')
print(f'  Final scale:     {scales[-1]:.0f}')
print(f'  Scale trajectory (every 20 steps): {[f"{s:.0f}" for s in scales[::20]]}')

Exercise 8 ★★★ — Hessian-Vector Products and Curvature Analysis

Computing the full Hessian is $O(d^2)$ in memory. A Hessian-vector product (HVP) $Hv$ can be computed in $O(d)$ via:

$$Hv = \nabla_x\left[\nabla f(x)^\top v\right] = \lim_{\varepsilon \to 0} \frac{\nabla f(x + \varepsilon v) - \nabla f(x)}{\varepsilon}$$

(a) Implement hvp_finite_diff(grad_f, x, v, eps=1e-5) using the forward difference approximation of the HVP.

(b) Verify correctness: for $f(x) = \frac{1}{2} x^\top A x$, $Hv = Av$. Test with a random PD matrix $A \in \mathbb{R}^{50 \times 50}$.

(c) Use the power method with HVPs to estimate the largest eigenvalue $\lambda_{\max}(H)$ of the Hessian at the Rosenbrock minimum $(1, 1)$. Compare to np.linalg.eigvalsh(rosenbrock_hess([1,1])).

(d) Run a few steps of Lanczos iteration using HVPs to estimate both $\lambda_{\max}$ and $\lambda_{\min}$ without forming $H$ explicitly. Report the condition number $\kappa(H) = \lambda_{\max} / \lambda_{\min}$.

Code cell 26

# Your Solution
# Exercise 8 — Scaffold

def hvp_finite_diff(grad_f, x, v, eps=1e-5):
    """
    Hessian-vector product via finite differences.
    Returns H(x) @ v.
    """

print("Learner scaffold loaded; write your solution here, then compare below.")

Code cell 27

# Solution
def hvp_finite_diff_sol(grad_f, x, v, eps=1e-5):
    return (grad_f(x + eps * v) - grad_f(x)) / eps

# (b) Verification
np.random.seed(42)
n = 50
M = np.random.randn(n, n)
A_pd = M.T @ M / n + np.eye(n) * 0.1  # Positive definite
b_pd = np.random.randn(n)
f_quad_nd = lambda x: 0.5 * x @ A_pd @ x - b_pd @ x
gf_quad_nd = lambda x: A_pd @ x - b_pd
x_test = np.random.randn(n)
v_test = np.random.randn(n)

hvp_fd = hvp_finite_diff_sol(gf_quad_nd, x_test, v_test)
hvp_exact = A_pd @ v_test
err = np.linalg.norm(hvp_fd - hvp_exact) / np.linalg.norm(hvp_exact)
print(f'HVP relative error: {err:.2e}')
ok = err < 1e-4
print(f'PASS: HVP finite diff accurate' if ok else f'FAIL: error = {err:.2e}')

# (c) Power method at Rosenbrock minimum
x_min = np.array([1.0, 1.0])
v = np.random.randn(2)
v /= np.linalg.norm(v)
lambda_est = 0.0
for _ in range(100):
    Hv = hvp_finite_diff_sol(rosenbrock_grad, x_min, v)
    lambda_est = np.linalg.norm(Hv)
    v = Hv / lambda_est

H_exact = rosenbrock_hess(x_min)
evals_exact = np.sort(np.linalg.eigvalsh(H_exact))[::-1]
print(f'\nPower method lambda_max: {lambda_est:.6f}')
print(f'Exact lambda_max:        {evals_exact[0]:.6f}')
print(f'PASS' if abs(lambda_est - evals_exact[0])/evals_exact[0] < 1e-4 else 'FAIL')

# (d) Lanczos iteration
def lanczos_hvp_sol(grad_f, x, k=10):
    n = len(x)
    q = np.random.randn(n); q /= np.linalg.norm(q)
    Q = [q]
    alphas, betas = [], []
    for j in range(k):
        z = hvp_finite_diff_sol(grad_f, x, Q[-1])
        a = Q[-1] @ z
        alphas.append(a)
        if j == 0:
            r = z - a * Q[-1]
        else:
            r = z - a * Q[-1] - betas[-1] * Q[-2]
        b = np.linalg.norm(r)
        betas.append(b)
        if b < 1e-12:
            break
        Q.append(r / b)
    T = np.diag(alphas) + np.diag(betas[:-1], 1) + np.diag(betas[:-1], -1)
    evals = np.linalg.eigvalsh(T)
    return evals.max(), evals.min()

lam_max, lam_min = lanczos_hvp_sol(rosenbrock_grad, x_min, k=2)
kappa_est = lam_max / max(abs(lam_min), 1e-10)
kappa_exact = evals_exact[0] / evals_exact[-1]
print(f'\nLanczos: lambda_max={lam_max:.4f}, lambda_min={lam_min:.4f}')
print(f'Estimated condition number: {kappa_est:.2f}')
print(f'Exact condition number:     {kappa_exact:.2f}')
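As an optional cross-check, assuming SciPy is available, the same finite-difference HVP can be wrapped in a matrix-free LinearOperator and handed to ARPACK for the largest Hessian eigenvalue of the 50-dimensional quadratic:

from scipy.sparse.linalg import LinearOperator, eigsh

H_op = LinearOperator((n, n), matvec=lambda w: hvp_finite_diff_sol(gf_quad_nd, x_test, w),
                      dtype=float)
lam_top = eigsh(H_op, k=1, which='LA', return_eigenvectors=False)[0]
print(f'eigsh lambda_max: {lam_top:.6f}   exact: {np.linalg.eigvalsh(A_pd)[-1]:.6f}')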

Exercise 9 ★★★ — Gradient Clipping as a Trust Region

Compare raw gradient descent with norm-clipped updates on a steep quadratic $f(\theta) = \tfrac{1}{2} \theta^\top H \theta$ with $H = \mathrm{diag}(100, 1)$. With step size $\eta = 0.05$ the raw iteration is unstable along the steep axis (since $\eta \cdot 100 > 2$), while clipping the update norm keeps the iterates bounded.

Code cell 29

# Your Solution
print("Write your solution here, then compare with the reference solution below.")

Code cell 30

# Solution
H = np.diag([100.0, 1.0])           # steep/flat quadratic: curvatures 100 and 1
theta = np.array([5.0, 5.0])
eta = 0.05                          # too large for the steep axis (eta * 100 > 2)

def loss(t):
    return 0.5 * t @ (H @ t)

def step(t, clip=None):
    g = H @ t
    upd = -eta * g
    if clip is not None and np.linalg.norm(upd) > clip:
        upd = upd / np.linalg.norm(upd) * clip   # rescale the update to the clipping radius
    return t + upd

raw, clipped = theta.copy(), theta.copy()
for _ in range(20):
    raw, clipped = step(raw), step(clipped, clip=0.5)

header('Exercise 9: Gradient Clipping')
print(f'raw loss={loss(raw):.3e}, clipped loss={loss(clipped):.3e}')
check_true('clipped path remains finite and improves',
           np.isfinite(loss(clipped)) and loss(clipped) < loss(theta))
print('Takeaway: clipping behaves like a simple trust-region constraint on update size.')

Exercise 10 ★★★ — Hessian-Vector Products by Finite Differences

Approximate $Hv$ without forming $H$, using a finite difference of gradients.

Code cell 32

# Your Solution
print("Write your solution here, then compare with the reference solution below.")

Code cell 33

# Solution
Q = np.array([[5.0, 1.0], [1.0, 2.0]])   # Hessian of the quadratic
b = np.array([1.0, -3.0])

def grad(x):
    return Q @ x + b

x = np.array([0.7, -1.2])
v = np.array([2.0, -0.5])
eps = 1e-5
hvp_fd = (grad(x + eps * v) - grad(x)) / eps   # forward-difference HVP
hvp_true = Q @ v                                # exact product, since the Hessian is Q

header('Exercise 10: Hessian-Vector Products')
print('finite diff:', hvp_fd, 'true:', hvp_true)
check_true('HVP finite difference is accurate', np.allclose(hvp_fd, hvp_true, atol=1e-7))
print('Takeaway: Hessian-vector products enable second-order methods without materializing dense Hessians.')