Theory Notebook
Converted from theory.ipynb for web reading.
Numerical Optimization — Theory Notebook
"In theory there is no difference between theory and practice. In practice there is."
Interactive derivations: line search, Newton's method, L-BFGS, finite differences, Adam as diagonal quasi-Newton.
Companion: notes.md | exercises.ipynb
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
import seaborn as sns
sns.set_theme(style="whitegrid", palette="colorblind")
HAS_SNS = True
except ImportError:
plt.style.use("seaborn-v0_8-whitegrid")
HAS_SNS = False
mpl.rcParams.update({
"figure.figsize": (10, 6),
"figure.dpi": 120,
"font.size": 13,
"axes.titlesize": 15,
"axes.labelsize": 13,
"xtick.labelsize": 11,
"ytick.labelsize": 11,
"legend.fontsize": 11,
"legend.framealpha": 0.85,
"lines.linewidth": 2.0,
"axes.spines.top": False,
"axes.spines.right": False,
"savefig.bbox": "tight",
"savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import numpy as np
import scipy.optimize as sopt
import numpy.linalg as nla
try:
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
import seaborn as sns
sns.set_theme(style='whitegrid', palette='colorblind')
HAS_SNS = True
except ImportError:
plt.style.use('seaborn-v0_8-whitegrid')
HAS_SNS = False
mpl.rcParams.update({
'figure.figsize': (10, 6), 'figure.dpi': 120,
'font.size': 13, 'axes.titlesize': 15,
'axes.spines.top': False, 'axes.spines.right': False,
})
HAS_MPL = True
except ImportError:
HAS_MPL = False
COLORS = {
'primary': '#0077BB',
'secondary': '#EE7733',
'tertiary': '#009988',
'error': '#CC3311',
'neutral': '#555555',
}
np.set_printoptions(precision=8, suppress=True)
np.random.seed(42)
print('Setup complete.')
print("Chapter helper setup complete.")
1. Test Functions
Standard optimization test functions used throughout this notebook.
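The workhorse is the 2-D Rosenbrock function, whose long curved valley makes it a classic stress test for line-search methods:

```latex
f(x_1, x_2) = 100\,(x_2 - x_1^2)^2 + (1 - x_1)^2,
\qquad
\nabla f(x) =
\begin{pmatrix}
-400\,x_1\,(x_2 - x_1^2) - 2\,(1 - x_1)\\[2pt]
200\,(x_2 - x_1^2)
\end{pmatrix},
```

with unique minimizer x* = (1, 1) and f(x*) = 0. The `quad` helper builds separable quadratics 0.5 x^T diag(h) x whose condition number κ = max(h)/min(h) can be dialed up at will.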
Code cell 5
# === 1.1 Test Functions ===
def rosenbrock(x):
return 100 * (x[1] - x[0]**2)**2 + (1 - x[0])**2
def rosenbrock_grad(x):
g = np.zeros(2)
g[0] = -400 * x[0] * (x[1] - x[0]**2) - 2 * (1 - x[0])
g[1] = 200 * (x[1] - x[0]**2)
return g
def rosenbrock_hess(x):
H = np.zeros((2, 2))
H[0, 0] = -400 * (x[1] - x[0]**2) + 800 * x[0]**2 + 2
H[0, 1] = -400 * x[0]
H[1, 0] = -400 * x[0]
H[1, 1] = 200
return H
def quad(H_diag):
"""Quadratic f(x) = 0.5 x^T diag(H_diag) x."""
def f(x): return 0.5 * np.sum(H_diag * x**2)
def g(x): return H_diag * x
def h(x): return np.diag(H_diag)
return f, g, h
# Verify at minimum
x_star = np.array([1.0, 1.0])
print(f'Rosenbrock at minimum (1,1): {rosenbrock(x_star)}')
print(f'Gradient at minimum: {rosenbrock_grad(x_star)}')
H_star = rosenbrock_hess(x_star)
kappa = nla.cond(H_star)
print(f'kappa(H*) = {kappa:.2f}')
2. Line Search Methods
Implement Armijo backtracking and compare it with a Wolfe-condition line search.
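For reference, the two standard conditions on a step size α along a descent direction p (as in Nocedal & Wright) are sufficient decrease and curvature:

```latex
\text{Armijo:}\quad f(x + \alpha p) \le f(x) + c_1\,\alpha\,\nabla f(x)^{\top} p,
\qquad 0 < c_1 < 1,
\\[4pt]
\text{Curvature (Wolfe):}\quad \nabla f(x + \alpha p)^{\top} p \ge c_2\,\nabla f(x)^{\top} p,
\qquad c_1 < c_2 < 1.
```

The backtracking search below enforces only the Armijo condition; a Wolfe-condition search is compared against it right after the backtracking cell.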
Code cell 7
# === 2.1 Backtracking Armijo ===
def armijo_search(f, g, x, p, alpha0=1.0, rho=0.5, c1=1e-4, max_iter=50):
"""Backtracking line search with Armijo (sufficient decrease) condition."""
alpha = alpha0
f0 = f(x)
slope = g(x) @ p
assert slope < 0, 'p must be a descent direction'
for i in range(max_iter):
if f(x + alpha * p) <= f0 + c1 * alpha * slope:
break
alpha *= rho
return alpha, i + 1
# Test: gradient descent on Rosenbrock
x = np.array([-1.0, 0.5])
g = rosenbrock_grad(x)
p = -g # Steepest descent direction
alpha_arm, n_evals = armijo_search(rosenbrock, rosenbrock_grad, x, p)
x_new = x + alpha_arm * p
print(f'Armijo step size: {alpha_arm:.6f} ({n_evals} evaluations)')
print(f'f(x) = {rosenbrock(x):.6f}')
print(f'f(x + alpha*p) = {rosenbrock(x_new):.6f}')
print(f'Sufficient decrease satisfied: {rosenbrock(x_new) <= rosenbrock(x) + 1e-4 * alpha_arm * (g @ p)}')
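As a cross-check against a Wolfe-condition search (a minimal sketch assuming `x`, `p`, and `alpha_arm` from the cell above are still in scope), `scipy.optimize.line_search` provides a strong-Wolfe line search:

```python
# Strong-Wolfe line search from SciPy, compared against the Armijo backtracking step.
# line_search returns (alpha, n_f_evals, n_grad_evals, new_fval, old_fval, new_slope);
# alpha is None if no step satisfying both conditions was found.
alpha_wolfe, fc, gc, *_ = sopt.line_search(
    rosenbrock, rosenbrock_grad, x, p, c1=1e-4, c2=0.9)

if alpha_wolfe is not None:
    curvature_ok = rosenbrock_grad(x + alpha_wolfe * p) @ p >= 0.9 * (rosenbrock_grad(x) @ p)
    print(f'Wolfe step size:  {alpha_wolfe:.6f} ({fc} f-evals, {gc} grad-evals)')
    print(f'Armijo step size: {alpha_arm:.6f}')
    print(f'Curvature condition satisfied: {curvature_ok}')
else:
    print('Strong-Wolfe search did not find an acceptable step for this direction.')
```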
Code cell 8
# === 2.2 Gradient Descent with Armijo Line Search ===
def gradient_descent_ls(f, grad_f, x0, tol=1e-8, max_iter=10000):
"""GD with Armijo backtracking."""
x = x0.copy()
history = [x.copy()]
losses = [f(x)]
for k in range(max_iter):
g = grad_f(x)
if nla.norm(g) < tol:
break
p = -g
alpha, _ = armijo_search(f, grad_f, x, p)
x = x + alpha * p
history.append(x.copy())
losses.append(f(x))
return x, history, losses
x0 = np.array([-1.0, 0.5])
x_gd, hist_gd, loss_gd = gradient_descent_ls(rosenbrock, rosenbrock_grad, x0, tol=1e-6)
print(f'GD with Armijo: {len(loss_gd)} iterations')
print(f'Solution: {x_gd}')
print(f'Final loss: {loss_gd[-1]:.2e}')
3. Newton's Method
Newton's method achieves quadratic convergence near a non-degenerate minimum.
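Each iteration solves the Newton system for the search direction; near a minimizer with a positive definite, Lipschitz-continuous Hessian, the error roughly squares per step once the line search accepts the unit step:

```latex
\nabla^2 f(x_k)\,p_k = -\nabla f(x_k),
\qquad x_{k+1} = x_k + \alpha_k p_k,
\qquad
\|x_{k+1} - x^*\| \le C\,\|x_k - x^*\|^2 .
```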
Code cell 10
# === 3.1 Newton's Method with Line Search ===
def newton_ls(f, grad_f, hess_f, x0, tol=1e-10, max_iter=200):
"""Newton's method with Armijo backtracking."""
x = x0.copy()
history = [x.copy()]
losses = [f(x)]
for k in range(max_iter):
g = grad_f(x)
if nla.norm(g) < tol:
break
H = hess_f(x)
# Positive-definiteness check via Cholesky; fall back to steepest descent if H is indefinite
try:
L = nla.cholesky(H)
p = -nla.solve(H, g) # Newton direction
except nla.LinAlgError:
p = -g # Fall back to gradient descent if H is indefinite
# Ensure descent direction
if grad_f(x) @ p >= 0:
p = -g
alpha, _ = armijo_search(f, grad_f, x, p)
x = x + alpha * p
history.append(x.copy())
losses.append(f(x))
return x, history, losses
x0 = np.array([-1.0, 0.5])
x_nt, hist_nt, loss_nt = newton_ls(rosenbrock, rosenbrock_grad, rosenbrock_hess, x0, tol=1e-10)
print(f'Newton: {len(loss_nt)} iterations')
print(f'Solution: {x_nt}')
print(f'Final loss: {loss_nt[-1]:.2e}')
ok = nla.norm(x_nt - [1.0, 1.0]) < 1e-6
print(f"{'PASS' if ok else 'FAIL'} - Newton converged to correct minimum")
Code cell 11
# === 3.2 Quadratic Convergence Demonstration ===
# On a quadratic, Newton converges in ONE step
H_diag = np.array([2.0, 10.0, 50.0, 100.0])
kappa = H_diag.max() / H_diag.min()
f_quad, g_quad, h_quad = quad(H_diag)
x0 = np.ones(4)
# Newton step
x_newt = x0 - nla.solve(h_quad(x0), g_quad(x0))
print(f'Newton on quadratic: one step result = {x_newt}')
print(f'(Should be [0,0,0,0])')
print(f'Loss after one Newton step: {f_quad(x_newt):.2e}')
# GD convergence on same problem
eta_opt = 2 / (H_diag.max() + H_diag.min())
rate_gd = (kappa - 1) / (kappa + 1)
print(f'\nGD convergence rate: {rate_gd:.4f}')
print(f'GD steps to 1e-8: ~{int(-8 / np.log10(rate_gd)) + 1}')
print(f'Newton steps: 1 (exact for quadratic!)')
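To check the quoted linear rate empirically (a small sketch reusing `g_quad`, `eta_opt`, and `rate_gd` from the cell above), run GD with the optimal fixed step and measure the per-iteration contraction of the error:

```python
# Empirical GD contraction factor on the same ill-conditioned quadratic.
x_q = np.ones(4)
norms = [nla.norm(x_q)]
for _ in range(200):
    x_q = x_q - eta_opt * g_quad(x_q)
    norms.append(nla.norm(x_q))
observed_rate = (norms[-1] / norms[-51]) ** (1 / 50)   # geometric mean over the last 50 steps
print(f'Observed GD rate:        {observed_rate:.4f}')
print(f'Theoretical (κ-1)/(κ+1): {rate_gd:.4f}')
```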
4. L-BFGS: Limited Memory BFGS
Implement the two-loop recursion and demonstrate super-linear convergence.
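The two-loop recursion implicitly applies the BFGS inverse-Hessian update to the m most recent curvature pairs s_k = x_{k+1} - x_k and y_k = ∇f_{k+1} - ∇f_k:

```latex
H_{k+1} = \left(I - \rho_k\, s_k y_k^{\top}\right) H_k \left(I - \rho_k\, y_k s_k^{\top}\right) + \rho_k\, s_k s_k^{\top},
\qquad \rho_k = \frac{1}{y_k^{\top} s_k},
```

so the secant condition H_{k+1} y_k = s_k holds, with the base matrix H_0 = γI, γ = s^T y / y^T y, rebuilt from the stored pairs at every iteration.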
Code cell 13
# === 4.1 L-BFGS Two-Loop Recursion ===
def lbfgs_direction(g, s_list, y_list):
"""Compute L-BFGS search direction via two-loop recursion."""
m = len(s_list)
if m == 0:
return -g # Initial step: steepest descent
rho = [1.0 / (y @ s) for s, y in zip(s_list, y_list)]
q = g.copy()
alpha = np.zeros(m)
# First loop (backward)
for i in range(m - 1, -1, -1):
alpha[i] = rho[i] * (s_list[i] @ q)
q -= alpha[i] * y_list[i]
# Scale by the initial inverse-Hessian approximation H0 = gamma*I (Barzilai-Borwein-type scaling)
s_last, y_last = s_list[-1], y_list[-1]
gamma = (s_last @ y_last) / (y_last @ y_last)
r = gamma * q
# Second loop (forward)
for i in range(m):
beta = rho[i] * (y_list[i] @ r)
r += s_list[i] * (alpha[i] - beta)
return -r
# Full L-BFGS optimizer
def lbfgs(f, grad_f, x0, m=10, tol=1e-10, max_iter=500):
x = x0.copy()
s_list, y_list = [], []
losses = [f(x)]
g = grad_f(x)
for k in range(max_iter):
if nla.norm(g) < tol:
break
p = lbfgs_direction(g, s_list, y_list)
# Line search
alpha, _ = armijo_search(f, grad_f, x, p, alpha0=1.0)
x_new = x + alpha * p
g_new = grad_f(x_new)
s = x_new - x
y = g_new - g
if y @ s > 1e-10: # Only update if secant condition holds
s_list.append(s)
y_list.append(y)
if len(s_list) > m:
s_list.pop(0)
y_list.pop(0)
x, g = x_new, g_new
losses.append(f(x))
return x, losses
x0 = np.array([-1.0, 0.5])
x_lb, loss_lb = lbfgs(rosenbrock, rosenbrock_grad, x0, m=10, tol=1e-10)
print(f'L-BFGS: {len(loss_lb)} iterations')
print(f'Solution: {x_lb}')
print(f'Final loss: {loss_lb[-1]:.2e}')
ok = nla.norm(x_lb - [1.0, 1.0]) < 1e-5
print(f"{'PASS' if ok else 'FAIL'} - L-BFGS converged")
5. Optimizer Comparison
Compare GD, Newton, and L-BFGS on the Rosenbrock function.
Code cell 15
# === 5.1 Convergence Comparison ===
x0 = np.array([-1.0, 0.5])
# GD with line search
_, _, loss_gd = gradient_descent_ls(rosenbrock, rosenbrock_grad, x0,
tol=1e-10, max_iter=20000)
# Newton
_, _, loss_nt = newton_ls(rosenbrock, rosenbrock_grad, rosenbrock_hess, x0,
tol=1e-10, max_iter=200)
# L-BFGS
_, loss_lb = lbfgs(rosenbrock, rosenbrock_grad, x0, m=10, tol=1e-10, max_iter=500)
# Scipy L-BFGS-B (reference)
res = sopt.minimize(rosenbrock, x0, jac=rosenbrock_grad, method='L-BFGS-B',
options={'maxiter': 500, 'ftol': 1e-15, 'gtol': 1e-10})
print(f'GD iterations: {len(loss_gd)-1:6d}')
print(f'Newton iterations: {len(loss_nt)-1:6d}')
print(f'L-BFGS iterations: {len(loss_lb)-1:6d}')
print(f'Scipy L-BFGS-B: {res.nit:6d}')
if HAS_MPL:
fig, ax = plt.subplots(figsize=(10, 5))
ax.semilogy(loss_gd[:500], color=COLORS['error'], label='Gradient Descent', alpha=0.8)
ax.semilogy(loss_nt, color=COLORS['secondary'], label='Newton')
ax.semilogy(loss_lb, color=COLORS['primary'], label='L-BFGS (m=10)')
ax.set_xlabel('Iteration'); ax.set_ylabel('f(x) (log scale)')
ax.set_title('Optimizer comparison on Rosenbrock function')
ax.legend()
plt.tight_layout(); plt.show()
print('Plot displayed.')
6. Finite Differences and Gradient Checking
Analyze the accuracy of finite difference approximations and find optimal step sizes.
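The experiment below traces the usual truncation-versus-rounding trade-off; with machine epsilon ε ≈ 2.2e-16, the rough error models and optimal steps are

```latex
E_{\text{fwd}}(h) \approx \tfrac{1}{2}\,|f''|\,h + \frac{2\,\varepsilon\,|f|}{h}
\;\;\Rightarrow\;\; h^{*} \sim \sqrt{\varepsilon} \approx 1.5\times 10^{-8},
\\[4pt]
E_{\text{ctr}}(h) \approx \tfrac{1}{6}\,|f'''|\,h^{2} + \frac{\varepsilon\,|f|}{h}
\;\;\Rightarrow\;\; h^{*} \sim \varepsilon^{1/3} \approx 6\times 10^{-6}.
```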
Code cell 17
# === 6.1 Finite Difference Accuracy vs Step Size ===
def f_test(x): return np.sin(x**2) * np.exp(-x)
def f_grad_true(x): return (2*x*np.cos(x**2) - np.sin(x**2)) * np.exp(-x)
x0 = 1.5
true_grad = f_grad_true(x0)
h_vals = np.logspace(-15, -1, 50)
err_forward = []
err_centered = []
for h in h_vals:
g_fwd = (f_test(x0 + h) - f_test(x0)) / h
g_ctr = (f_test(x0 + h) - f_test(x0 - h)) / (2*h)
err_forward.append(abs(g_fwd - true_grad))
err_centered.append(abs(g_ctr - true_grad))
if HAS_MPL:
fig, ax = plt.subplots(figsize=(10, 5))
ax.loglog(h_vals, err_forward, color=COLORS['error'], label='Forward difference')
ax.loglog(h_vals, err_centered, color=COLORS['primary'], label='Centered difference')
h_opt_fwd = np.sqrt(np.finfo(np.float64).eps)
h_opt_ctr = np.cbrt(np.finfo(np.float64).eps)
ax.axvline(h_opt_fwd, color='red', linestyle=':', label=f'Optimal h (fwd) ≈ {h_opt_fwd:.1e}')
ax.axvline(h_opt_ctr, color='blue', linestyle=':', label=f'Optimal h (ctr) ≈ {h_opt_ctr:.1e}')
ax.set_xlabel('h'); ax.set_ylabel('|error|')
ax.set_title('Finite difference error vs step size h')
ax.legend(fontsize=9)
plt.tight_layout(); plt.show()
print('Plot displayed.')
# Find empirical optimal
h_opt_fwd_emp = h_vals[np.argmin(err_forward)]
h_opt_ctr_emp = h_vals[np.argmin(err_centered)]
print(f'Empirical optimal h (forward): {h_opt_fwd_emp:.2e} (theory: {np.sqrt(np.finfo(np.float64).eps):.2e})')
print(f'Empirical optimal h (centered): {h_opt_ctr_emp:.2e} (theory: {np.cbrt(np.finfo(np.float64).eps):.2e})')
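The section title also promises gradient checking; here is a minimal multivariate sketch (the helper name `grad_check` is ours) that validates `rosenbrock_grad` against centered differences at the near-optimal step h ≈ ε^(1/3):

```python
# Centered-difference gradient check for a multivariate function.
def grad_check(f, grad_f, x, h=None):
    """Max relative error between grad_f(x) and a centered-difference estimate."""
    x = np.asarray(x, dtype=float)
    if h is None:
        h = np.cbrt(np.finfo(np.float64).eps)   # near-optimal step for centered differences
    g_analytic = grad_f(x)
    g_numeric = np.zeros_like(x)
    for i in range(x.size):
        e = np.zeros_like(x)
        e[i] = h
        g_numeric[i] = (f(x + e) - f(x - e)) / (2 * h)
    return np.max(np.abs(g_numeric - g_analytic) / np.maximum(np.abs(g_analytic), 1.0))

rel_err = grad_check(rosenbrock, rosenbrock_grad, np.array([-1.0, 0.5]))
print(f'Max relative gradient error (Rosenbrock): {rel_err:.2e}')
print(f"{'PASS' if rel_err < 1e-6 else 'FAIL'} - analytic gradient matches centered differences")
```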
Code cell 18
# === 6.2 Complex-Step Derivative ===
def complex_step_grad(f_complex, x, h=1e-100):
"""Complex-step derivative: Im[f(x + ih)] / h."""
return np.imag(f_complex(x + 1j * h)) / h
# f must be defined for complex arguments
def f_complex(x): return np.sin(x**2) * np.exp(-x)
x0 = 1.5
true_grad = f_grad_true(x0)
complex_grad = complex_step_grad(f_complex, x0)
print('Complex-step derivative:')
print(f' True gradient: {true_grad:.15f}')
print(f' Complex step: {complex_grad:.15f}')
print(f' Error: {abs(complex_grad - true_grad):.2e}')
print(f' (centered diff): {abs((f_test(x0+1e-6) - f_test(x0-1e-6))/(2e-6) - true_grad):.2e}')
ok = abs(complex_grad - true_grad) < 1e-14
print(f"{'PASS' if ok else 'FAIL'} - complex step achieves near machine precision")
7. Adam as Diagonal Quasi-Newton
Demonstrate that Adam approximates diagonal preconditioning.
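The cell below uses the standard Adam recursion; on this quadratic the gradient component is g_i = h_i x_i, so the denominator sqrt(v_hat_i) tracks |g_i| and each coordinate ends up taking a step of roughly uniform size, mimicking a diagonal preconditioner:

```latex
m_t = \beta_1 m_{t-1} + (1-\beta_1)\,g_t,\qquad
v_t = \beta_2 v_{t-1} + (1-\beta_2)\,g_t^{2},\qquad
\hat m_t = \frac{m_t}{1-\beta_1^{t}},\quad
\hat v_t = \frac{v_t}{1-\beta_2^{t}},
\\[4pt]
x_{t+1} = x_t - \alpha\,\frac{\hat m_t}{\sqrt{\hat v_t} + \epsilon}.
```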
Code cell 20
# === 7.1 Convergence on Ill-Conditioned Quadratic ===
h_diag = np.array([1.0, 10.0, 100.0, 1000.0])
kappa = h_diag.max() / h_diag.min()
d = len(h_diag)
x0 = np.ones(d)
n_steps = 500
# === Gradient Descent ===
eta_gd = 2.0 / (h_diag.max() + h_diag.min())
x_gd = x0.copy()
err_gd = [nla.norm(x_gd)]
for _ in range(n_steps):
g = h_diag * x_gd
x_gd -= eta_gd * g
err_gd.append(nla.norm(x_gd))
# === Adam ===
beta1, beta2, eps_a = 0.9, 0.999, 1e-8
alpha_adam = 0.1
x_adam = x0.copy()
m_adam, v_adam = np.zeros(d), np.zeros(d)
err_adam = [nla.norm(x_adam)]
for t in range(1, n_steps + 1):
g = h_diag * x_adam
m_adam = beta1 * m_adam + (1 - beta1) * g
v_adam = beta2 * v_adam + (1 - beta2) * g**2
m_hat = m_adam / (1 - beta1**t)
v_hat = v_adam / (1 - beta2**t)
x_adam -= alpha_adam * m_hat / (np.sqrt(v_hat) + eps_a)
err_adam.append(nla.norm(x_adam))
# === Diagonal Preconditioned GD (optimal) ===
x_pc = x0.copy()
err_pc = [nla.norm(x_pc)]
for _ in range(n_steps):
g = h_diag * x_pc
x_pc -= g / h_diag # Exact diagonal Newton step
err_pc.append(nla.norm(x_pc))
if HAS_MPL:
fig, ax = plt.subplots(figsize=(10, 5))
ax.semilogy(err_gd, color=COLORS['error'], label=f'Gradient descent (κ={int(kappa)})')
ax.semilogy(err_adam, color=COLORS['secondary'], label='Adam')
ax.semilogy(err_pc, color=COLORS['primary'], label='Diagonal precond GD (Newton)')
ax.set_xlabel('Iteration'); ax.set_ylabel('||x|| (log)')
ax.set_title('Adam as diagonal quasi-Newton: convergence comparison')
ax.legend()
plt.tight_layout(); plt.show()
print(f'GD final error: {err_gd[-1]:.4f}')
print(f'Adam final error: {err_adam[-1]:.4f}')
print(f'Diag precond: {err_pc[-1]:.4e}')
ok = err_adam[-1] < err_gd[-1]
print(f"{'PASS' if ok else 'FAIL'} - Adam beats GD on ill-conditioned problem")
Summary
| Algorithm | Memory | Convergence | Use case |
|---|---|---|---|
| Gradient descent | O(d) | Linear, rate (κ-1)/(κ+1) | Baseline |
| CG | O(d) | Linear, rate (√κ-1)/(√κ+1) | SPD systems |
| L-BFGS (m vector pairs) | O(md) | Super-linear | Full-batch |
| Newton | O(d²) | Quadratic | Small-scale |
| Adam | O(d) | Empirically fast | Stochastic |
| Diagonal precond GD | O(d) | Linear; exact in one step for a diagonal Hessian | Convex with known Hessian |