Converted from theory.ipynb for web reading.

Numerical Optimization — Theory Notebook

"In theory there is no difference between theory and practice. In practice there is."

Interactive derivations: line search, Newton's method, L-BFGS, trust region, finite differences, Adam as diagonal quasi-Newton.

Companion: notes.md | exercises.ipynb

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

import numpy as np
import scipy.optimize as sopt
import numpy.linalg as nla

try:
    import matplotlib.pyplot as plt
    import matplotlib as mpl
    try:
        import seaborn as sns
        sns.set_theme(style='whitegrid', palette='colorblind')
        HAS_SNS = True
    except ImportError:
        plt.style.use('seaborn-v0_8-whitegrid')
        HAS_SNS = False
    mpl.rcParams.update({
        'figure.figsize': (10, 6), 'figure.dpi': 120,
        'font.size': 13, 'axes.titlesize': 15,
        'axes.spines.top': False, 'axes.spines.right': False,
    })
    HAS_MPL = True
except ImportError:
    HAS_MPL = False

COLORS = {
    'primary':   '#0077BB',
    'secondary': '#EE7733',
    'tertiary':  '#009988',
    'error':     '#CC3311',
    'neutral':   '#555555',
}

np.set_printoptions(precision=8, suppress=True)
np.random.seed(42)
print('Setup complete.')

print("Chapter helper setup complete.")

1. Test Functions

Standard optimization test functions used throughout this notebook.

Code cell 5

# === 1.1 Test Functions ===

def rosenbrock(x):
    return 100 * (x[1] - x[0]**2)**2 + (1 - x[0])**2

def rosenbrock_grad(x):
    g = np.zeros(2)
    g[0] = -400 * x[0] * (x[1] - x[0]**2) - 2 * (1 - x[0])
    g[1] = 200 * (x[1] - x[0]**2)
    return g

def rosenbrock_hess(x):
    H = np.zeros((2, 2))
    H[0, 0] = -400 * (x[1] - x[0]**2) + 800 * x[0]**2 + 2
    H[0, 1] = -400 * x[0]
    H[1, 0] = -400 * x[0]
    H[1, 1] = 200
    return H

def quad(H_diag):
    """Quadratic f(x) = 0.5 x^T diag(H_diag) x."""
    def f(x): return 0.5 * np.sum(H_diag * x**2)
    def g(x): return H_diag * x
    def h(x): return np.diag(H_diag)
    return f, g, h

# Verify at minimum
x_star = np.array([1.0, 1.0])
print(f'Rosenbrock at minimum (1,1): {rosenbrock(x_star)}')
print(f'Gradient at minimum: {rosenbrock_grad(x_star)}')
H_star = rosenbrock_hess(x_star)
kappa = nla.cond(H_star)
print(f'kappa(H*) = {kappa:.2f}')
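
Before relying on these analytic derivatives, it is worth cross-checking them numerically. A minimal added sketch (not part of the original cell), using scipy's finite-difference comparison at an arbitrary non-stationary test point:

x_check = np.array([-1.2, 1.0])
fd_err = sopt.check_grad(rosenbrock, rosenbrock_grad, x_check)
print(f'check_grad error at {x_check}: {fd_err:.2e}')
print(f'relative to ||grad||:            {fd_err / nla.norm(rosenbrock_grad(x_check)):.2e}')
# The error should be tiny relative to the gradient norm; finite-difference
# accuracy is limited to roughly sqrt(machine epsilon), as Section 6 shows.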

2. Line Search Methods

Implement backtracking line search with the Armijo (sufficient decrease) condition; the sketch after the Armijo cell checks the Wolfe curvature condition and compares against scipy's strong-Wolfe line search.

Code cell 7

# === 2.1 Backtracking Armijo ===

def armijo_search(f, g, x, p, alpha0=1.0, rho=0.5, c1=1e-4, max_iter=50):
    """Backtracking line search with Armijo (sufficient decrease) condition."""
    alpha = alpha0
    f0 = f(x)
    slope = g(x) @ p
    assert slope < 0, 'p must be a descent direction'

    for i in range(max_iter):
        if f(x + alpha * p) <= f0 + c1 * alpha * slope:
            break
        alpha *= rho

    return alpha, i + 1

# Test: gradient descent on Rosenbrock
x = np.array([-1.0, 0.5])
g = rosenbrock_grad(x)
p = -g  # Steepest descent direction

alpha_arm, n_evals = armijo_search(rosenbrock, rosenbrock_grad, x, p)
x_new = x + alpha_arm * p

print(f'Armijo step size: {alpha_arm:.6f} ({n_evals} evaluations)')
print(f'f(x) = {rosenbrock(x):.6f}')
print(f'f(x + alpha*p) = {rosenbrock(x_new):.6f}')
print(f'Sufficient decrease satisfied: {rosenbrock(x_new) <= rosenbrock(x) + 1e-4 * alpha_arm * (g @ p)}')
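
The Armijo condition alone can accept arbitrarily small steps; the Wolfe conditions add a curvature requirement, grad f(x + alpha*p)^T p >= c2 * grad f(x)^T p. A hedged sketch (not part of the original cell; the usual textbook constants c1 = 1e-4, c2 = 0.9 are assumed) checks both conditions for the step found above and compares against scipy's strong-Wolfe search:

c1, c2 = 1e-4, 0.9
g0 = rosenbrock_grad(x)
slope0 = g0 @ p
g_alpha = rosenbrock_grad(x + alpha_arm * p)

armijo_ok    = rosenbrock(x + alpha_arm * p) <= rosenbrock(x) + c1 * alpha_arm * slope0
curvature_ok = g_alpha @ p >= c2 * slope0  # (weak) Wolfe curvature condition

print(f'Armijo condition holds:    {armijo_ok}')
print(f'Curvature condition holds: {curvature_ok}')

# Reference: scipy's line search enforces the strong Wolfe conditions;
# it returns None for alpha if the search fails.
alpha_wolfe = sopt.line_search(rosenbrock, rosenbrock_grad, x, p, c1=c1, c2=c2)[0]
print(f'scipy strong-Wolfe step:   {alpha_wolfe}')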

Code cell 8

# === 2.2 Gradient Descent with Armijo Line Search ===

def gradient_descent_ls(f, grad_f, x0, tol=1e-8, max_iter=10000):
    """GD with Armijo backtracking."""
    x = x0.copy()
    history = [x.copy()]
    losses = [f(x)]

    for k in range(max_iter):
        g = grad_f(x)
        if nla.norm(g) < tol:
            break
        p = -g
        alpha, _ = armijo_search(f, grad_f, x, p)
        x = x + alpha * p
        history.append(x.copy())
        losses.append(f(x))

    return x, history, losses

x0 = np.array([-1.0, 0.5])
x_gd, hist_gd, loss_gd = gradient_descent_ls(rosenbrock, rosenbrock_grad, x0, tol=1e-6)
print(f'GD with Armijo: {len(loss_gd)} iterations')
print(f'Solution: {x_gd}')
print(f'Final loss: {loss_gd[-1]:.2e}')

3. Newton's Method

Newton's method achieves quadratic convergence near a non-degenerate minimum.

Code cell 10

# === 3.1 Newton's Method with Line Search ===

def newton_ls(f, grad_f, hess_f, x0, tol=1e-10, max_iter=200):
    """Newton's method with Armijo backtracking."""
    x = x0.copy()
    history = [x.copy()]
    losses = [f(x)]

    for k in range(max_iter):
        g = grad_f(x)
        if nla.norm(g) < tol:
            break

        H = hess_f(x)
        # Use Cholesky as a positive-definiteness test: if it succeeds, take
        # the Newton direction; otherwise fall back to steepest descent.
        try:
            nla.cholesky(H)
            p = -nla.solve(H, g)  # Newton direction
        except nla.LinAlgError:
            p = -g  # H not positive definite: fall back to gradient descent

        # Ensure descent direction
        if grad_f(x) @ p >= 0:
            p = -g

        alpha, _ = armijo_search(f, grad_f, x, p)
        x = x + alpha * p
        history.append(x.copy())
        losses.append(f(x))

    return x, history, losses

x0 = np.array([-1.0, 0.5])
x_nt, hist_nt, loss_nt = newton_ls(rosenbrock, rosenbrock_grad, rosenbrock_hess, x0, tol=1e-10)
print(f'Newton: {len(loss_nt)} iterations')
print(f'Solution: {x_nt}')
print(f'Final loss: {loss_nt[-1]:.2e}')

ok = nla.norm(x_nt - [1.0, 1.0]) < 1e-6
print(f"{'PASS' if ok else 'FAIL'} - Newton converged to correct minimum")

Code cell 11

# === 3.2 Quadratic Convergence Demonstration ===

# On a quadratic, Newton converges in ONE step
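# (Why: for f(x) = 0.5 x^T H x the gradient is Hx, so the Newton step
#  x - H^{-1}(Hx) lands exactly at the minimizer 0.)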
H_diag = np.array([2.0, 10.0, 50.0, 100.0])
kappa = H_diag.max() / H_diag.min()
f_quad, g_quad, h_quad = quad(H_diag)

x0 = np.ones(4)

# Newton step
x_newt = x0 - nla.solve(h_quad(x0), g_quad(x0))
print(f'Newton on quadratic: one step result = {x_newt}')
print(f'(Should be [0,0,0,0])')
print(f'Loss after one Newton step: {f_quad(x_newt):.2e}')

# GD convergence on same problem
eta_opt = 2 / (H_diag.max() + H_diag.min())  # optimal constant GD step size (yields the rate below)
rate_gd = (kappa - 1) / (kappa + 1)
print(f'\nGD convergence rate: {rate_gd:.4f}')
print(f'GD steps to 1e-8: ~{int(-8 / np.log10(rate_gd)) + 1}')
print(f'Newton steps: 1 (exact for quadratic!)')

4. L-BFGS: Limited Memory BFGS

Implement the two-loop recursion and demonstrate super-linear convergence.

Code cell 13

# === 4.1 L-BFGS Two-Loop Recursion ===

def lbfgs_direction(g, s_list, y_list):
    """Compute L-BFGS search direction via two-loop recursion."""
    m = len(s_list)

    if m == 0:
        return -g  # Initial step: steepest descent

    rho = [1.0 / (y @ s) for s, y in zip(s_list, y_list)]
    q = g.copy()
    alpha = np.zeros(m)

    # First loop (backward)
    for i in range(m - 1, -1, -1):
        alpha[i] = rho[i] * (s_list[i] @ q)
        q -= alpha[i] * y_list[i]

    # Initial inverse-Hessian scaling H0 = gamma * I with gamma = s^T y / y^T y
    # (a Barzilai-Borwein-type choice)
    s_last, y_last = s_list[-1], y_list[-1]
    gamma = (s_last @ y_last) / (y_last @ y_last)
    r = gamma * q

    # Second loop (forward)
    for i in range(m):
        beta = rho[i] * (y_list[i] @ r)
        r += s_list[i] * (alpha[i] - beta)

    return -r

# Full L-BFGS optimizer
def lbfgs(f, grad_f, x0, m=10, tol=1e-10, max_iter=500):
    x = x0.copy()
    s_list, y_list = [], []
    losses = [f(x)]
    g = grad_f(x)

    for k in range(max_iter):
        if nla.norm(g) < tol:
            break

        p = lbfgs_direction(g, s_list, y_list)

        # Line search
        alpha, _ = armijo_search(f, grad_f, x, p, alpha0=1.0)

        x_new = x + alpha * p
        g_new = grad_f(x_new)

        s = x_new - x
        y = g_new - g
        if y @ s > 1e-10:  # Curvature condition y^T s > 0: keeps the update well-defined and PD
            s_list.append(s)
            y_list.append(y)
            if len(s_list) > m:
                s_list.pop(0)
                y_list.pop(0)

        x, g = x_new, g_new
        losses.append(f(x))

    return x, losses

x0 = np.array([-1.0, 0.5])
x_lb, loss_lb = lbfgs(rosenbrock, rosenbrock_grad, x0, m=10, tol=1e-10)
print(f'L-BFGS: {len(loss_lb)} iterations')
print(f'Solution: {x_lb}')
print(f'Final loss: {loss_lb[-1]:.2e}')

ok = nla.norm(x_lb - [1.0, 1.0]) < 1e-5
print(f"{'PASS' if ok else 'FAIL'} - L-BFGS converged")

5. Optimizer Comparison

Compare GD, Newton, and L-BFGS on the Rosenbrock function.

Code cell 15

# === 5.1 Convergence Comparison ===

x0 = np.array([-1.0, 0.5])

# GD with line search
_, _, loss_gd = gradient_descent_ls(rosenbrock, rosenbrock_grad, x0,
                                     tol=1e-10, max_iter=20000)

# Newton
_, _, loss_nt = newton_ls(rosenbrock, rosenbrock_grad, rosenbrock_hess, x0,
                           tol=1e-10, max_iter=200)

# L-BFGS
_, loss_lb = lbfgs(rosenbrock, rosenbrock_grad, x0, m=10, tol=1e-10, max_iter=500)

# Scipy L-BFGS-B (reference)
res = sopt.minimize(rosenbrock, x0, jac=rosenbrock_grad, method='L-BFGS-B',
                    options={'maxiter': 500, 'ftol': 1e-15, 'gtol': 1e-10})
print(f'GD iterations:     {len(loss_gd)-1:6d}')
print(f'Newton iterations: {len(loss_nt)-1:6d}')
print(f'L-BFGS iterations: {len(loss_lb)-1:6d}')
print(f'Scipy L-BFGS-B:    {res.nit:6d}')

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.semilogy(loss_gd[:500], color=COLORS['error'], label='Gradient Descent', alpha=0.8)
    ax.semilogy(loss_nt, color=COLORS['secondary'], label='Newton')
    ax.semilogy(loss_lb, color=COLORS['primary'], label='L-BFGS (m=10)')
    ax.set_xlabel('Iteration'); ax.set_ylabel('f(x) (log scale)')
    ax.set_title('Optimizer comparison on Rosenbrock function')
    ax.legend()
    plt.tight_layout(); plt.show()
    print('Plot displayed.')

6. Finite Differences and Gradient Checking

Analyze the accuracy of finite difference approximations and find optimal step sizes.
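
Why the optimal step sizes arise: for the forward difference, the truncation error scales like (|f''|/2)·h while floating-point cancellation contributes roughly 2ε|f|/h, so the total error is minimized near h* ≈ 2·sqrt(ε|f|/|f''|) ~ sqrt(ε) ≈ 1.5e-8. The centered difference trades a smaller truncation term, (|f'''|/6)·h², against ε|f|/h, giving h* ~ ε^(1/3) ≈ 6e-6 (order-one constants assumed throughout).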

Code cell 17

# === 6.1 Finite Difference Accuracy vs Step Size ===

def f_test(x): return np.sin(x**2) * np.exp(-x)
def f_grad_true(x): return (2*x*np.cos(x**2) - np.sin(x**2)) * np.exp(-x)

x0 = 1.5
true_grad = f_grad_true(x0)

h_vals = np.logspace(-15, -1, 50)
err_forward  = []
err_centered = []

for h in h_vals:
    g_fwd = (f_test(x0 + h) - f_test(x0)) / h
    g_ctr = (f_test(x0 + h) - f_test(x0 - h)) / (2*h)
    err_forward.append(abs(g_fwd - true_grad))
    err_centered.append(abs(g_ctr - true_grad))

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.loglog(h_vals, err_forward, color=COLORS['error'], label='Forward difference')
    ax.loglog(h_vals, err_centered, color=COLORS['primary'], label='Centered difference')
    h_opt_fwd = np.sqrt(np.finfo(np.float64).eps)
    h_opt_ctr = np.cbrt(np.finfo(np.float64).eps)
    ax.axvline(h_opt_fwd, color='red', linestyle=':', label=f'Optimal h (fwd) ≈ {h_opt_fwd:.1e}')
    ax.axvline(h_opt_ctr, color='blue', linestyle=':', label=f'Optimal h (ctr) ≈ {h_opt_ctr:.1e}')
    ax.set_xlabel('h'); ax.set_ylabel('|error|')
    ax.set_title('Finite difference error vs step size h')
    ax.legend(fontsize=9)
    plt.tight_layout(); plt.show()
    print('Plot displayed.')

# Find empirical optimal
h_opt_fwd_emp = h_vals[np.argmin(err_forward)]
h_opt_ctr_emp = h_vals[np.argmin(err_centered)]
print(f'Empirical optimal h (forward):  {h_opt_fwd_emp:.2e}  (theory: {np.sqrt(np.finfo(np.float64).eps):.2e})')
print(f'Empirical optimal h (centered): {h_opt_ctr_emp:.2e}  (theory: {np.cbrt(np.finfo(np.float64).eps):.2e})')

Code cell 18

# === 6.2 Complex-Step Derivative ===

def complex_step_grad(f_complex, x, h=1e-100):
    """Complex-step derivative: Im[f(x + ih)] / h."""
    return np.imag(f_complex(x + 1j * h)) / h

# f must be defined for complex arguments
def f_complex(x): return np.sin(x**2) * np.exp(-x)

x0 = 1.5
true_grad = f_grad_true(x0)
complex_grad = complex_step_grad(f_complex, x0)

print('Complex-step derivative:')
print(f'  True gradient:    {true_grad:.15f}')
print(f'  Complex step:     {complex_grad:.15f}')
print(f'  Error:            {abs(complex_grad - true_grad):.2e}')
print(f'  (centered diff):  {abs((f_test(x0+1e-6) - f_test(x0-1e-6))/(2e-6) - true_grad):.2e}')

ok = abs(complex_grad - true_grad) < 1e-14
print(f"{'PASS' if ok else 'FAIL'} - complex step achieves near machine precision")

7. Adam as Diagonal Quasi-Newton

Demonstrate that Adam approximates diagonal preconditioning.

Code cell 20

# === 7.1 Convergence on Ill-Conditioned Quadratic ===

h_diag = np.array([1.0, 10.0, 100.0, 1000.0])
kappa = h_diag.max() / h_diag.min()
d = len(h_diag)
x0 = np.ones(d)
n_steps = 500

# === Gradient Descent ===
eta_gd = 2.0 / (h_diag.max() + h_diag.min())
x_gd = x0.copy()
err_gd = [nla.norm(x_gd)]
for _ in range(n_steps):
    g = h_diag * x_gd
    x_gd -= eta_gd * g
    err_gd.append(nla.norm(x_gd))

# === Adam ===
beta1, beta2, eps_a = 0.9, 0.999, 1e-8
alpha_adam = 0.1
x_adam = x0.copy()
m_adam, v_adam = np.zeros(d), np.zeros(d)
err_adam = [nla.norm(x_adam)]
for t in range(1, n_steps + 1):
    g = h_diag * x_adam
    m_adam = beta1 * m_adam + (1 - beta1) * g
    v_adam = beta2 * v_adam + (1 - beta2) * g**2
    m_hat = m_adam / (1 - beta1**t)
    v_hat = v_adam / (1 - beta2**t)
    x_adam -= alpha_adam * m_hat / (np.sqrt(v_hat) + eps_a)
    err_adam.append(nla.norm(x_adam))

# === Diagonal Preconditioned GD (optimal) ===
x_pc = x0.copy()
err_pc = [nla.norm(x_pc)]
for _ in range(n_steps):
    g = h_diag * x_pc
    x_pc -= g / h_diag  # Exact diagonal Newton step
    err_pc.append(nla.norm(x_pc))

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.semilogy(err_gd, color=COLORS['error'], label=f'Gradient descent (κ={int(kappa)})')
    ax.semilogy(err_adam, color=COLORS['secondary'], label='Adam')
    ax.semilogy(err_pc, color=COLORS['primary'], label='Diagonal precond GD (Newton)')
    ax.set_xlabel('Iteration'); ax.set_ylabel('||x|| (log)')
    ax.set_title('Adam as diagonal quasi-Newton: convergence comparison')
    ax.legend()
    plt.tight_layout(); plt.show()

print(f'GD final error:     {err_gd[-1]:.4f}')
print(f'Adam final error:   {err_adam[-1]:.4f}')
print(f'Diag precond:       {err_pc[-1]:.4e}')

ok = err_adam[-1] < err_gd[-1]
print(f"{'PASS' if ok else 'FAIL'} - Adam beats GD on ill-conditioned problem")

Summary

| Algorithm | Memory | Convergence | Use case |
| --- | --- | --- | --- |
| Gradient descent | O(d) | Linear: ((κ-1)/(κ+1))^k | Baseline |
| CG | O(d) | Linear: ((√κ-1)/(√κ+1))^k | SPD systems |
| L-BFGS (m vectors) | O(md) | Super-linear | Full-batch |
| Newton | O(d^2) | Quadratic | Small-scale |
| Adam | O(d) | Empirically fast | Stochastic |
| Diagonal precond GD | O(d) | Linear, rate O(1) for diagonal Hessian | Convex with known Hessian |

Next: Interpolation and Approximation →