Theory Notebook
Converted from theory.ipynb for web reading.
Optimality Conditions — Theory Notebook
"The art of optimization is knowing not just how to move, but when to stop."
Interactive theory: first-order conditions, second-order tests, convexity, Lagrange multipliers, KKT, duality, and deep learning landscape geometry.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
import seaborn as sns
sns.set_theme(style="whitegrid", palette="colorblind")
HAS_SNS = True
except ImportError:
plt.style.use("seaborn-v0_8-whitegrid")
HAS_SNS = False
mpl.rcParams.update({
"figure.figsize": (10, 6),
"figure.dpi": 120,
"font.size": 13,
"axes.titlesize": 15,
"axes.labelsize": 13,
"xtick.labelsize": 11,
"ytick.labelsize": 11,
"legend.fontsize": 11,
"legend.framealpha": 0.85,
"lines.linewidth": 2.0,
"axes.spines.top": False,
"axes.spines.right": False,
"savefig.bbox": "tight",
"savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import numpy as np
import numpy.linalg as la
from scipy import optimize, special, stats
from scipy.optimize import minimize, fsolve, linprog
from math import factorial
import math
import matplotlib.patches as patches
COLORS = {
"primary": "#0077BB",
"secondary": "#EE7733",
"tertiary": "#009988",
"error": "#CC3311",
"neutral": "#555555",
"highlight": "#EE3377",
}
HAS_MPL = True
np.set_printoptions(precision=8, suppress=True)
np.random.seed(42)
spmin = minimize
try:
import torch
HAS_TORCH = True
except ImportError:
torch = None
HAS_TORCH = False
def header(title):
print("\n" + "=" * len(title))
print(title)
print("=" * len(title))
def check_true(name, cond):
ok = bool(cond)
print(f"{'PASS' if ok else 'FAIL'} - {name}")
return ok
def check_close(name, got, expected, tol=1e-8):
ok = np.allclose(got, expected, atol=tol, rtol=tol)
print(f"{'PASS' if ok else 'FAIL'} - {name}: got {got}, expected {expected}")
return ok
def check(name, got, expected, tol=1e-8):
return check_close(name, got, expected, tol=tol)
def sigmoid(x):
x = np.asarray(x, dtype=float)
return np.where(x >= 0, 1/(1+np.exp(-x)), np.exp(x)/(1+np.exp(x)))
def softmax(z, axis=-1):
z = np.asarray(z, dtype=float)
z = z - np.max(z, axis=axis, keepdims=True)
e = np.exp(z)
return e / np.sum(e, axis=axis, keepdims=True)
def relu(x):
return np.maximum(0, x)
def relu_prime(x):
return np.where(np.asarray(x) > 0, 1.0, 0.0)
def centered_diff(f, x, h=1e-6):
return (f(x + h) - f(x - h)) / (2 * h)
def numerical_gradient(f, x, h=1e-6):
x = np.asarray(x, dtype=float)
grad = np.zeros_like(x, dtype=float)
for idx in np.ndindex(x.shape):
xp = x.copy(); xm = x.copy()
xp[idx] += h; xm[idx] -= h
grad[idx] = (f(xp) - f(xm)) / (2*h)
return grad
def numerical_jacobian(f, x, h=1e-6):
x = np.asarray(x, dtype=float)
y0 = np.asarray(f(x), dtype=float)
J = np.zeros((y0.size, x.size), dtype=float)
flat = x.reshape(-1)
for j in range(x.size):
xp = flat.copy(); xm = flat.copy()
xp[j] += h; xm[j] -= h
J[:, j] = ((np.asarray(f(xp.reshape(x.shape))) - np.asarray(f(xm.reshape(x.shape)))) / (2*h)).reshape(-1)
return J.reshape(y0.shape + x.shape)
def grad_check(f, x, analytic_grad, h=1e-6):
numeric_grad = numerical_gradient(f, x, h=h)
denom = la.norm(analytic_grad) + la.norm(numeric_grad) + 1e-12
return la.norm(analytic_grad - numeric_grad) / denom
def jacobian_fd(f, x, h=1e-6):
x = np.asarray(x, dtype=float)
y0 = np.asarray(f(x), dtype=float)
J = np.zeros((y0.size, x.size), dtype=float)
flat = x.reshape(-1)
for j in range(x.size):
xp = flat.copy(); xm = flat.copy()
xp[j] += h; xm[j] -= h
yp = np.asarray(f(xp.reshape(x.shape)), dtype=float).reshape(-1)
ym = np.asarray(f(xm.reshape(x.shape)), dtype=float).reshape(-1)
J[:, j] = (yp - ym) / (2*h)
return J.reshape(y0.shape + x.shape)
def hessian_fd(f, x, h=1e-5):
x = np.asarray(x, dtype=float)
H = np.zeros((x.size, x.size), dtype=float)
flat = x.reshape(-1)
for j in range(x.size):
xp = flat.copy(); xm = flat.copy()
xp[j] += h; xm[j] -= h
gp = numerical_gradient(lambda z: f(z.reshape(x.shape)), xp.reshape(x.shape), h=h).reshape(-1)
gm = numerical_gradient(lambda z: f(z.reshape(x.shape)), xm.reshape(x.shape), h=h).reshape(-1)
H[:, j] = (gp - gm) / (2*h)
return H.reshape(x.shape + x.shape)
def grad_fd(f, x, h=1e-6):
return numerical_gradient(f, x, h=h)
def fd_grad(f, x, h=1e-6):
return numerical_gradient(f, np.asarray(x, dtype=float), h=h)
print("Chapter helper setup complete.")
1. First-Order Necessary Conditions
A vanishing gradient, ∇f(x*) = 0, is the fundamental first-order necessary condition at any local minimum of a differentiable function. We visualise this and verify it numerically for several functions.
Code cell 5
# === 1.1 Gradient at Critical Points ===
# Three canonical scalar functions and their critical points
def f_min(x): return x**2 # global min at x=0
def f_max(x): return -x**2 # global max at x=0
def f_saddle(x): return x**3 - 3*x  # critical points at x=±1 (min at x=1, max at x=-1); x=0 is an inflection, not a critical point
x_vals = np.linspace(-2, 2, 400)
# Find critical points numerically
from scipy.optimize import brentq
df_saddle = lambda x: 3*x**2 - 3 # derivative of x^3 - 3x
x_crit1 = brentq(df_saddle, -2, 0)
x_crit2 = brentq(df_saddle, 0, 2)
print(f'Critical points of f(x)=x^3-3x: x={x_crit1:.4f}, x={x_crit2:.4f}')
print(f'f\'(x_crit1) = {df_saddle(x_crit1):.2e} (should be ~0)')
if HAS_MPL:
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, f, title, crits in zip(
axes,
[f_min, f_max, f_saddle],
['f(x)=x² (minimum)', 'f(x)=-x² (maximum)', 'f(x)=x³-3x (two critical points)'],
[[0.0], [0.0], [x_crit1, x_crit2]]
):
ax.plot(x_vals, [f(x) for x in x_vals], 'b-', linewidth=2)
for xc in crits:
ax.axvline(xc, color='r', linestyle='--', alpha=0.5)
ax.plot(xc, f(xc), 'ro', markersize=8)
ax.set_title(title); ax.set_xlabel('x'); ax.set_ylabel('f(x)')
plt.suptitle('Critical Points: $\\nabla f = 0$', fontsize=14)
plt.tight_layout(); plt.show()
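# Quick check (sketch): classify the two critical points of x^3 - 3x with the 1D
# second-derivative test, using a central second difference for f''.
d2f = lambda f, x, h=1e-4: (f(x + h) - 2*f(x) + f(x - h)) / h**2
for xc in [x_crit1, x_crit2]:
    curv = d2f(f_saddle, xc)
    kind = 'local min' if curv > 0 else 'local max' if curv < 0 else 'inconclusive'
    print(f"f''({xc:+.4f}) ~ {curv:.4f} -> {kind}")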
Code cell 6
# === 1.2 Gradient Field and Critical Points in ℝ² ===
# Canonical 2D examples
def f_bowl(xy): x,y = xy; return x**2 + 2*y**2
def f_saddle2(xy): x,y = xy; return x**2 - y**2
def f_monkey(xy): x,y = xy; return x**2 + y**2 - 2*x*y  # (x-y)^2: degenerate quadratic (zero curvature along x=y), defined for reference only
xx, yy = np.meshgrid(np.linspace(-2,2,40), np.linspace(-2,2,40))
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for ax, f, title in zip(
axes,
[f_bowl, f_saddle2],
['Bowl: $f=x^2+2y^2$ (minimum at origin)',
'Saddle: $f=x^2-y^2$ (saddle at origin)']
):
Z = np.array([[f([xi,yi]) for xi in np.linspace(-2,2,100)]
for yi in np.linspace(-2,2,100)])
ax.contour(np.linspace(-2,2,100), np.linspace(-2,2,100), Z, 20, cmap='viridis')
# Plot gradient arrows (downsampled)
Gx = np.array([[grad_fd(f, np.array([xi,yi]))[0]
for xi in np.linspace(-2,2,10)]
for yi in np.linspace(-2,2,10)])
Gy = np.array([[grad_fd(f, np.array([xi,yi]))[1]
for xi in np.linspace(-2,2,10)]
for yi in np.linspace(-2,2,10)])
xs10 = np.linspace(-2,2,10)
ax.quiver(xs10, xs10, -Gx, -Gy, alpha=0.4, color='red', scale=50)
ax.plot(0, 0, 'ro', markersize=10, label='Critical point')
ax.set_title(title); ax.set_xlabel('x'); ax.set_ylabel('y')
ax.legend()
plt.suptitle('Gradient field and critical points in ℝ²', fontsize=14)
plt.tight_layout(); plt.show()
2. Second-Order Conditions and Hessian Spectrum
The Hessian determines whether a critical point is a minimum (all eigenvalues > 0), maximum (all eigenvalues < 0), or saddle (eigenvalues of mixed sign).
Code cell 8
# === 2.1 Second-Order Test: Eigenvalue Analysis ===
def classify_critical_point(f, x_star, name=''):
"""Classify a critical point using Hessian eigenvalues."""
grad = grad_fd(f, x_star)
H = hessian_fd(f, x_star)
eigvals = np.linalg.eigvalsh(H)
print(f' Critical point: {x_star}')
print(f' ||grad f|| = {la.norm(grad):.2e} (should be ~0)')
print(f' Hessian eigenvalues: {eigvals}')
if np.all(eigvals > 1e-8):
typ = 'STRICT LOCAL MINIMUM'
elif np.all(eigvals < -1e-8):
typ = 'STRICT LOCAL MAXIMUM'
elif np.any(eigvals > 1e-8) and np.any(eigvals < -1e-8):
k = np.sum(eigvals < 0)
typ = f'SADDLE POINT (Morse index = {k})'
else:
typ = 'DEGENERATE (higher-order analysis needed)'
print(f' Classification: {typ}\n')
return eigvals, typ
# Test on known functions
print('=== Critical Point Classification ===')
f_bowl_nd = lambda x: x[0]**2 + 2*x[1]**2
f_saddle_nd = lambda x: x[0]**2 - x[1]**2
f_rosenbrock = lambda x: (1-x[0])**2 + 100*(x[1]-x[0]**2)**2
print('Bowl f(x,y) = x^2 + 2y^2 at (0,0):')
classify_critical_point(f_bowl_nd, np.array([0.,0.]))
print('Saddle f(x,y) = x^2 - y^2 at (0,0):')
classify_critical_point(f_saddle_nd, np.array([0.,0.]))
print('Rosenbrock f(x,y) at (1,1):')
classify_critical_point(f_rosenbrock, np.array([1.,1.]))
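# Quick check (sketch): at the saddle of f(x,y)=x^2-y^2, move along each Hessian
# eigenvector and watch f rise (positive eigenvalue) or fall (negative eigenvalue).
H_sad = hessian_fd(f_saddle_nd, np.array([0., 0.]))
evals_sad, evecs_sad = np.linalg.eigh(H_sad)
t_step = 0.1
for lam_i, v in zip(evals_sad, evecs_sad.T):
    delta = f_saddle_nd(t_step * v) - f_saddle_nd(np.zeros(2))
    print(f'eigenvalue {lam_i:+.2f}: f(t v) - f(0) = {delta:+.4f} '
          f'({"increases" if delta > 0 else "decreases"} along this direction)')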
Code cell 9
# === 2.2 The ℝ² Determinant Test ===
# Verify the 2D determinant test: det(H) > 0 & H_11 > 0 => min
# det(H) > 0 & H_11 < 0 => max
# det(H) < 0 => saddle
test_cases = [
(lambda x: x[0]**2 + x[1]**2, np.zeros(2), 'min (x²+y²)'),
(lambda x: -(x[0]**2+x[1]**2), np.zeros(2), 'max (-(x²+y²))'),
(lambda x: x[0]**2 - x[1]**2, np.zeros(2), 'saddle (x²-y²)'),
(lambda x: x[0]**2 + x[0]*x[1] + x[1]**2, np.zeros(2), 'min (x²+xy+y²)'),
]
print('=== 2D Determinant Test ===')
for f, x0, name in test_cases:
H = hessian_fd(f, x0)
d = np.linalg.det(H)
h11 = H[0,0]
if d > 0 and h11 > 0: pred = 'LOCAL MIN'
elif d > 0 and h11 < 0: pred = 'LOCAL MAX'
elif d < 0: pred = 'SADDLE'
else: pred = 'INCONCLUSIVE'
print(f' {name}: det={d:.4f}, H_11={h11:.4f} => {pred}')
3. Convexity Analysis
Verifying convexity via second-order conditions and checking the defining inequality.
Code cell 11
# === 3.1 Second-Order Convexity Check ===
def is_convex_so(f, domain_pts, tol=1e-10):
"""Check convexity by verifying H(x) >= 0 at multiple points."""
results = []
for x in domain_pts:
H = hessian_fd(f, x)
eigvals = la.eigvalsh(H)
results.append(np.all(eigvals >= -tol))
return all(results), results
# Test grid
pts_1d = [np.array([x]) for x in np.linspace(-3, 3, 20)]
pts_2d = [np.array([x, y]) for x in np.linspace(-2,2,8) for y in np.linspace(-2,2,8)]
print('=== Convexity Verification (second-order) ===')
test_fns = [
(lambda x: x[0]**2, pts_1d, 'f(x)=x² (convex)'),
(lambda x: np.exp(x[0]), pts_1d, 'f(x)=exp(x) (convex)'),
(lambda x: np.log(x[0]+1e-8), [np.array([x]) for x in np.linspace(0.01,3,20)],
'f(x)=log(x) (CONCAVE, not convex)'),
(lambda x: x[0]**2 + x[1]**2, pts_2d, 'f(x,y)=x²+y² (convex)'),
(lambda x: x[0]**2 - x[1]**2, pts_2d, 'f(x,y)=x²-y² (NEITHER)'),
]
for f, pts, name in test_fns:
ok, _ = is_convex_so(f, pts)
print(f' {name}: convex={ok}')
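# Complementary check (sketch): test the defining chord inequality
# f(t x + (1-t) y) <= t f(x) + (1-t) f(y) directly on sampled pairs.
def convexity_gap(f, xs, n_pairs=200, seed=0):
    """Most negative value of t f(x)+(1-t) f(y) - f(t x+(1-t) y); negative means a violation was found."""
    rng = np.random.default_rng(seed)
    worst = np.inf
    for _ in range(n_pairs):
        x, y = xs[rng.integers(len(xs))], xs[rng.integers(len(xs))]
        t = rng.uniform()
        worst = min(worst, t*f(x) + (1-t)*f(y) - f(t*x + (1-t)*y))
    return worst
print('\nChord-inequality check (worst gap; negative means a violation was found):')
print(f"  x^2    : {convexity_gap(lambda x: x[0]**2, pts_1d):+.4f}")
print(f"  sin(x) : {convexity_gap(lambda x: np.sin(x[0]), pts_1d):+.4f}")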
Code cell 12
# === 3.2 Visualising Convexity: Chord Above Graph ===
# For a convex function, the chord between any two points lies above the graph
f_cvx = lambda x: x**2 + 0.5*np.abs(x)
f_ncvx = lambda x: np.sin(x)
x = np.linspace(-2, 2, 200)
x1, x2 = -1.5, 1.0
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for ax, f, title in zip(axes, [f_cvx, f_ncvx], ['Convex (chord above)', 'Non-convex (chord below in parts)']):
y = f(x)
ax.plot(x, y, 'b-', linewidth=2, label='f(x)')
# Chord
chord_x = np.linspace(x1, x2, 50)
chord_y = f(x1) + (f(x2)-f(x1))*(chord_x-x1)/(x2-x1)
ax.plot(chord_x, chord_y, 'r--', linewidth=2, label='Chord')
ax.plot([x1,x2], [f(x1),f(x2)], 'ro', markersize=8)
ax.set_title(title)
ax.legend()
ax.set_xlabel('x'); ax.set_ylabel('f(x)')
plt.suptitle('Convexity: chord lies above graph', fontsize=13)
plt.tight_layout(); plt.show()
Code cell 13
# === 3.3 Strong Convexity and Condition Number ===
# For mu-strongly convex, L-smooth: condition number kappa = L/mu
# governs convergence rate of gradient descent: (1 - mu/L)^T
mu_vals = [0.1, 1.0, 5.0]
L = 10.0
print('=== Strong Convexity and Convergence ===')
T = 100
for mu in mu_vals:
kappa = L / mu
rate = (1 - mu/L)**T # GD convergence factor after T steps
print(f' mu={mu:.1f}: kappa={kappa:.1f}, GD factor after {T} steps = {rate:.4f}')
if HAS_MPL:
fig, ax = plt.subplots(figsize=(10, 5))
T_range = np.arange(1, 201)
for mu in [0.1, 0.5, 1.0, 5.0]:
kappa = L/mu
rates = (1 - mu/L)**T_range
ax.semilogy(T_range, rates, label=f'mu={mu}, kappa={kappa:.0f}')
ax.set_xlabel('Iteration T'); ax.set_ylabel('Convergence factor (1-mu/L)^T')
ax.set_title(f'GD Convergence Rate vs. Strong Convexity (L={L})')
ax.legend(); ax.grid(True, alpha=0.4)
plt.tight_layout(); plt.show()
print('\nTakeaway: Strong convexity (large mu) dramatically speeds up convergence.')
print('Ridge regularisation adds mu to all eigenvalues of H, enabling faster training.')
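# Empirical check (sketch): run GD with step 1/L on a separable quadratic and
# compare the error against the theoretical contraction factor (1 - mu/L)^k.
mu_demo, L_demo, steps = 1.0, 10.0, 50
x_gd = np.array([1.0, 1.0])            # minimiser is the origin
for k in range(steps):
    grad = np.array([mu_demo * x_gd[0], L_demo * x_gd[1]])  # grad of 0.5*(mu x1^2 + L x2^2)
    x_gd = x_gd - grad / L_demo
err = la.norm(x_gd)
bound = (1 - mu_demo / L_demo)**steps * la.norm([1.0, 1.0])
print(f'\nGD with step 1/L after {steps} steps: ||x_k|| = {err:.2e}, '
      f'bound (1-mu/L)^k ||x_0|| = {bound:.2e} (bound holds: {err <= bound + 1e-12})')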
4. Lagrange Multipliers
Equality-constrained optimisation: the Lagrangian, geometric tangency condition, and the shadow price interpretation.
Code cell 15
# === 4.1 Lagrange Multiplier Geometry ===
# Minimise f(x,y) = x^2 + y^2 (distance from origin) subject to x + 2y = 5
# Lagrangian: L = x^2 + y^2 + lambda*(x + 2y - 5)
# Stationarity: 2x + lambda = 0, 2y + 2*lambda = 0
# => x = -lambda/2, y = -lambda
# Constraint: -lambda/2 + 2*(-lambda) = 5 => -5lambda/2 = 5 => lambda* = -2
# => x* = 1, y* = 2, f* = 1 + 4 = 5
x_star = np.array([1.0, 2.0])
lambda_star = -2.0
f_val = x_star[0]**2 + x_star[1]**2
constraint = x_star[0] + 2*x_star[1] - 5
print('=== Lagrange Multiplier: Distance to Line ===')
print(f'Optimal x* = {x_star}, lambda* = {lambda_star}')
print(f'f(x*) = {f_val} (should be 5)')
print(f'Constraint residual: {constraint:.2e} (should be ~0)')
# Verify: gradient condition
grad_f = 2*x_star
grad_g = np.array([1.0, 2.0]) # gradient of g(x,y) = x + 2y - 5
stationarity = grad_f + lambda_star * grad_g
print(f'Stationarity ||grad L|| = {la.norm(stationarity):.2e}')
if HAS_MPL:
fig, ax = plt.subplots(figsize=(7, 6))
x = np.linspace(-1, 4, 200)
# Level curves of f
for r in [1, np.sqrt(5), 2*np.sqrt(5), 3*np.sqrt(5)]:
theta = np.linspace(0, 2*np.pi, 200)
ax.plot(r*np.cos(theta), r*np.sin(theta), 'b-', alpha=0.3)
# Constraint line
ax.plot(x, (5-x)/2, 'g-', linewidth=2, label='$x+2y=5$')
# Optimal point
ax.plot(*x_star, 'ro', markersize=10, label='$x^*=(1,2)$')
# Gradient vectors
scale = 0.3
ax.annotate('', xy=x_star+scale*grad_f/la.norm(grad_f),
xytext=x_star, arrowprops=dict(color='red', width=2))
ax.annotate('', xy=x_star+scale*grad_g/la.norm(grad_g),
xytext=x_star, arrowprops=dict(color='green', width=2))
ax.text(x_star[0]+0.1, x_star[1]+0.3, '$\\nabla f$', color='red', fontsize=12)
ax.text(x_star[0]-0.5, x_star[1]-0.3, '$\\nabla g$', color='green', fontsize=12)
ax.set_xlim(-1, 4); ax.set_ylim(-1, 4)
ax.set_aspect('equal')
ax.set_xlabel('x'); ax.set_ylabel('y')
ax.set_title('Lagrange: $\\nabla f \\parallel \\nabla g$ at optimum')
ax.legend(); plt.tight_layout(); plt.show()
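# Numerical cross-check (sketch): solve the same constrained problem with SLSQP
# and compare against the analytic KKT solution x* = (1, 2).
res_lag = spmin(lambda v: v[0]**2 + v[1]**2, np.zeros(2), method='SLSQP',
                constraints=[{'type': 'eq', 'fun': lambda v: v[0] + 2*v[1] - 5}])
print(f'\nSLSQP solution: {res_lag.x.round(6)} (analytic: {x_star})')
print(f'Max deviation from analytic x*: {np.max(np.abs(res_lag.x - x_star)):.2e}')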
Code cell 16
# === 4.2 Shadow Price: Envelope Theorem Verification ===
# Constrained problem: min x^2 + y^2 s.t. x + 2y = b
# Optimal: x* = b/5, y* = 2b/5, p*(b) = b^2/5
# Shadow price: dp*/db = 2b/5, so at b=5: dp*/db = 2
# Lagrange multiplier (convention grad f + lambda*grad g = 0, with g = x + 2y - b): lambda* = -2b/5, so at b=5: lambda* = -2
# Envelope theorem: dp*/db = dL/db = lambda* * (partial g/partial b) = lambda* * (-1) = -lambda*
# Check at b=5: dp*/db = 2 and -lambda* = 2  ✓
def p_star(b):
# min x^2+y^2 s.t. x+2y=b: optimal value = b^2/5
return b**2 / 5.0
b_vals = np.linspace(1, 10, 100)
p_vals = p_star(b_vals)
dp_db_analytic = 2*b_vals/5 # derivative of b^2/5
dp_db_fd = (p_star(b_vals+1e-5) - p_star(b_vals-1e-5)) / (2e-5)
print('=== Envelope Theorem: Shadow Price Verification ===')
b_test = 5.0
lambda_star_at5 = -2*b_test/5 # Lambda* at b=5 (with our sign convention: grad f + lambda grad g = 0)
dp_db_at5 = 2*b_test/5
print(f'At b=5: dp*/db = {dp_db_at5:.4f}')
print(f' -lambda* = {-lambda_star_at5:.4f} (shadow price = -lambda*)')
import numpy.testing as npt
npt.assert_allclose(dp_db_at5, -lambda_star_at5, rtol=1e-6)
print('PASS — Shadow price equals -lambda* (envelope theorem verified)')
if HAS_MPL:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
ax1.plot(b_vals, p_vals, 'b-', linewidth=2, label='$p^*(b) = b^2/5$')
ax1.set_xlabel('b (constraint RHS)'); ax1.set_ylabel('Optimal value $p^*$')
ax1.set_title('Optimal Value vs. Constraint Parameter')
ax1.legend()
ax2.plot(b_vals, dp_db_analytic, 'b-', linewidth=2, label='Analytic $dp^*/db$')
ax2.plot(b_vals, dp_db_fd, 'r--', linewidth=2, label='Finite difference')
ax2.set_xlabel('b'); ax2.set_ylabel('Shadow price')
ax2.set_title('Shadow Price $dp^*/db$ (= $-\\lambda^*$)')
ax2.legend()
plt.tight_layout(); plt.show()
Code cell 17
# === 4.3 PCA as Lagrange Multiplier Problem ===
# Find v1 = argmax v^T Sigma v s.t. ||v||^2 = 1
# KKT: 2 Sigma v = 2 lambda v => Sigma v = lambda v (eigenvalue)
# Optimal: v1 = top eigenvector, lambda* = top eigenvalue = max variance
np.random.seed(42)
n_samples = 500
# Correlated 2D data
rho = 0.8
Sigma_true = np.array([[1.0, rho], [rho, 1.0]])
X = np.random.multivariate_normal([0,0], Sigma_true, n_samples)
Sigma = X.T @ X / n_samples
# Solve PCA via eigendecomposition (= Lagrange KKT solution)
eigvals, eigvecs = la.eigh(Sigma)
idx = np.argsort(eigvals)[::-1]
eigvals, eigvecs = eigvals[idx], eigvecs[:, idx]
v1 = eigvecs[:, 0]
lambda1 = eigvals[0]
print('=== PCA via Lagrange Multipliers ===')
print(f'True Sigma:\n{Sigma_true}')
print(f'Sample Sigma:\n{Sigma.round(3)}')
print(f'Top eigenvector v1 = {v1.round(4)}')
print(f'Top eigenvalue lambda1 = {lambda1:.4f} (= captured variance)')
print(f'Variance captured: {100*lambda1/eigvals.sum():.1f}%')
# Verify KKT: Sigma v1 = lambda1 v1
kkt_residual = Sigma @ v1 - lambda1 * v1
print(f'KKT stationarity residual ||Sigma v1 - lambda1 v1|| = {la.norm(kkt_residual):.2e}')
print(f'||v1|| = {la.norm(v1):.6f} (constraint satisfied)')
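# Quick check (sketch): no random unit vector should capture more variance than v1,
# since v1 solves max v^T Sigma v on the unit sphere.
rng = np.random.default_rng(0)
rand_vars = []
for _ in range(1000):
    v = rng.standard_normal(2)
    v /= la.norm(v)
    rand_vars.append(v @ Sigma @ v)
print(f'Max variance over 1000 random unit vectors: {max(rand_vars):.4f} <= lambda1 = {lambda1:.4f}')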
5. KKT Conditions
The four KKT conditions for inequality-constrained problems: stationarity, primal feasibility, dual feasibility, and complementary slackness.
Code cell 19
# === 5.1 KKT Verification Function ===
def verify_kkt(x_star, mu_star, lambda_star, f_grad, h_vals, h_grads, g_vals, g_grads, tol=1e-5):
"""Verify all four KKT conditions."""
# 1. Stationarity
stationarity = f_grad.copy()
for mu, hg in zip(mu_star, h_grads):
stationarity += mu * hg
for lam, gg in zip(lambda_star, g_grads):
stationarity += lam * gg
cond1 = np.allclose(stationarity, 0, atol=tol)
# 2. Primal feasibility
ineq_ok = all(h <= tol for h in h_vals)
eq_ok = all(abs(g) <= tol for g in g_vals)
cond2 = ineq_ok and eq_ok
# 3. Dual feasibility
cond3 = all(mu >= -tol for mu in mu_star)
# 4. Complementary slackness
cs_vals = [mu * h for mu, h in zip(mu_star, h_vals)]
cond4 = all(abs(cs) <= tol for cs in cs_vals)
print(f' (1) Stationarity: {"PASS" if cond1 else "FAIL"}')
print(f' (2) Primal feasibility: {"PASS" if cond2 else "FAIL"}')
print(f' (3) Dual feasibility: {"PASS" if cond3 else "FAIL"}')
print(f' (4) Complementary slack: {"PASS" if cond4 else "FAIL"}')
return cond1 and cond2 and cond3 and cond4
# Example: min (x1-2)^2 + (x2-1)^2 s.t. x1 + x2 <= 3, x1 >= 0, x2 >= 0
# Unconstrained min at (2,1) is feasible (it lies exactly on the boundary x1+x2=3), so (2,1) is a KKT point with all mu*=0
x_star = np.array([2.0, 1.0])
f_grad_at_star = 2*(x_star - np.array([2.0, 1.0])) # = [0, 0]
h_vals = [x_star[0]+x_star[1]-3, -x_star[0], -x_star[1]] # <= 0
h_grads = [np.array([1.,1.]), np.array([-1.,0.]), np.array([0.,-1.])]
mu_star = [0.0, 0.0, 0.0] # all inactive
print('=== KKT Verification: Unconstrained Minimum is Feasible ===')
print('min (x1-2)^2+(x2-1)^2 s.t. x1+x2<=3, x1,x2>=0')
print(f'x* = {x_star}, mu* = {mu_star}')
verify_kkt(x_star, mu_star, [], f_grad_at_star, h_vals, h_grads, [], [])
Code cell 20
# === 5.2 KKT with Active Constraint: Constrained Minimum ===
# min x1^2 + x2^2 s.t. x1 + x2 = 3 (equality, active)
# x1 <= 2 (inequality)
# Unconstrained min of x1^2+x2^2 is (0,0), not feasible for eq constraint
# With x1+x2=3: min on line x1+x2=3 is at x1=x2=1.5 (equidistant point)
# Check if x1 <= 2: 1.5 <= 2 TRUE, so inequality is inactive
x_star = np.array([1.5, 1.5])
lambda_eq = -3.0 # From 2*x1 + lambda = 0 => lambda = -2*1.5 = -3
mu_ineq = 0.0 # inactive
f_grad = 2*x_star # grad of x1^2+x2^2
g_grad = np.array([1., 1.]) # grad of x1+x2-3 (equality)
h_grad = np.array([1., 0.]) # grad of x1-2 (inequality h<=0)
print('=== KKT: Active Equality, Inactive Inequality ===')
print(f'x* = {x_star}, lambda_eq* = {lambda_eq}, mu_ineq* = {mu_ineq}')
verify_kkt(
x_star,
mu_star=[mu_ineq],
lambda_star=[lambda_eq],
f_grad=f_grad,
h_vals=[x_star[0]-2],
h_grads=[h_grad],
g_vals=[x_star[0]+x_star[1]-3],
g_grads=[g_grad]
)
print(f'\nComplementary slackness: mu*_ineq={mu_ineq} * h(x*)={x_star[0]-2} = {mu_ineq*(x_star[0]-2)}')
print('Inactive constraint => mu* = 0 (the constraint has no influence at the optimum)')
Code cell 21
# === 5.3 Complementary Slackness Visualisation ===
# Parametric study: min (x-a)^2 s.t. x <= 1
# For a <= 1: unconstrained min x*=a, mu*=0 (inactive)
# For a > 1: constrained min x*=1, constraint active, mu*=2(a-1)>0
a_vals = np.linspace(-1, 3, 100)
x_stars = np.minimum(a_vals, 1.0) # optimal solution
mu_stars = np.maximum(0, 2*(a_vals - 1)) # multiplier
obj_vals = (x_stars - a_vals)**2
if HAS_MPL:
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
axes[0].plot(a_vals, x_stars, 'b-', linewidth=2)
axes[0].axvline(1, color='r', linestyle='--', alpha=0.5)
axes[0].set_xlabel('a'); axes[0].set_ylabel('x* (optimal)')
axes[0].set_title('Optimal Solution')
axes[1].plot(a_vals, mu_stars, 'g-', linewidth=2)
axes[1].axvline(1, color='r', linestyle='--', alpha=0.5)
axes[1].set_xlabel('a'); axes[1].set_ylabel('mu* (multiplier)')
axes[1].set_title('Lagrange Multiplier')
axes[2].plot(a_vals, mu_stars * (x_stars - 1), 'r-', linewidth=2)
axes[2].axhline(0, color='k', linestyle='--', alpha=0.3)
axes[2].set_xlabel('a'); axes[2].set_ylabel('mu* * h(x*)')
axes[2].set_title('Complementary Slackness (should be 0)')
plt.suptitle('min (x-a)² s.t. x ≤ 1: KKT conditions vs parameter a', fontsize=12)
plt.tight_layout(); plt.show()
print('Complementary slackness mu* * h(x*) = 0 holds for every a, as KKT requires.')
print('When a <= 1: x*=a (interior), mu*=0 (zero times slack)')
print('When a > 1: x*=1 (on boundary), h(x*)=0 (active, zero times mu*)')
6. Duality Theory
Weak duality, strong duality, the dual problem, and duality gap monitoring.
Code cell 23
# === 6.1 Weak Duality Demonstration ===
# LP example: max 2x + 3y s.t. x + y <= 4, x,y >= 0
# Equivalently: min -2x - 3y s.t. x+y<=4, x>=0, y>=0
# Primal optimal: x*=0, y*=4, p* = -12
# Dual: for each (mu1, mu2, mu3) >= 0, q(mu) = inf_x,y L(x,y,mu)
# L = -2x-3y + mu1(x+y-4) - mu2*x - mu3*y
# = (-2+mu1-mu2)x + (-3+mu1-mu3)y - 4*mu1
# For q to be finite: coefficients of x,y must be 0
# => mu1-mu2=2, mu1-mu3=3, q = -4*mu1
# Dual: min 4*mu1 s.t. mu1>=2, mu1>=3, mu1,mu2,mu3>=0
# => mu1*=3, q* = -12 = p* (strong duality holds)
print('=== LP Duality ===')
# Primal optimal
x_star, y_star = 0.0, 4.0
p_star = -2*x_star - 3*y_star
print(f'Primal: x*=0, y*=4, p* = {p_star}')
# Dual at various multipliers — show weak duality
print('\nDual function values q(mu) <= p* = -12:')
for mu1 in [2.5, 3.0, 3.5, 4.0]:
# With mu1-mu2=2, mu1-mu3=3 => q = -4*mu1
q = -4 * mu1
print(f' mu1={mu1}: q={q:.1f} <= p*={p_star}? {q <= p_star + 1e-8}')
print(f'\nDual optimal: mu1*=3, d* = -12 = p* (strong duality, zero gap)')
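# Cross-check (sketch): confirm the primal optimum with scipy.linprog (imported above).
lp_check = linprog(c=[-2.0, -3.0], A_ub=[[1.0, 1.0]], b_ub=[4.0],
                   bounds=[(0, None), (0, None)], method='highs')
print(f'linprog primal optimum: x = {lp_check.x.round(4)}, p* = {lp_check.fun:.1f}')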
Code cell 24
# === 6.2 SVM Primal-Dual Equivalence ===
# Implement hard-margin SVM primal and dual, verify strong duality
from scipy.optimize import minimize as scipy_min
np.random.seed(0)
# Linearly separable 2D data
n_pos, n_neg = 10, 10
X_pos = np.random.randn(n_pos, 2) + np.array([2.0, 0.0])
X_neg = np.random.randn(n_neg, 2) + np.array([-2.0, 0.0])
X_svm = np.vstack([X_pos, X_neg])
y_svm = np.hstack([np.ones(n_pos), -np.ones(n_neg)])
n = len(y_svm)
# SVM Dual: max sum(alpha) - 0.5 * alpha^T Q alpha
# s.t. 0 <= alpha <= inf, sum(alpha_i y_i) = 0
# Q_ij = y_i y_j x_i^T x_j
Q = np.outer(y_svm, y_svm) * (X_svm @ X_svm.T)
def dual_objective(alpha):
return 0.5 * alpha @ Q @ alpha - alpha.sum()
def dual_gradient(alpha):
return Q @ alpha - np.ones(n)
from scipy.optimize import minimize as spmin
constraints = [{'type': 'eq', 'fun': lambda a: a @ y_svm}]
bounds = [(0, None)] * n
alpha0 = np.ones(n) * 0.1
res = spmin(dual_objective, alpha0, jac=dual_gradient,
method='SLSQP', bounds=bounds, constraints=constraints)
alpha_star = res.x
# Recover primal from dual
w_star = (alpha_star * y_svm) @ X_svm
# b: use support vectors (alpha > threshold)
sv_mask = alpha_star > 1e-4
b_star = np.mean(y_svm[sv_mask] - X_svm[sv_mask] @ w_star)
primal_val = 0.5 * la.norm(w_star)**2
dual_val = -res.fun # dual maximises, scipy minimises
print('=== SVM Primal-Dual ===')
print(f'w* = {w_star.round(4)}, b* = {b_star:.4f}')
print(f'Primal value (0.5 ||w||^2) = {primal_val:.6f}')
print(f'Dual value = {dual_val:.6f}')
print(f'Duality gap = {abs(primal_val - dual_val):.2e} (should be ~0)')
print(f'Support vectors: {sv_mask.sum()}')
# Check complementary slackness: alpha_i * (y_i(w^T x_i + b) - 1) = 0
margins = y_svm * (X_svm @ w_star + b_star) - 1
cs_violations = np.max(np.abs(alpha_star * margins))
print(f'Max CS violation: {cs_violations:.2e} (should be ~0)')
Code cell 25
# === 6.3 Duality Gap Convergence (Interior Point) ===
# Track duality gap for a simple LP as penalty parameter decreases
# min c^T x s.t. Ax <= b (using log barrier method)
# Simple 2D LP: min -x1 - 2*x2 s.t. x1+x2<=4, x1<=3, x2<=3, x1,x2>=0
# Optimal: (1, 3) -> objective = -1-6 = -7
def lp_primal(x):
return -x[0] - 2*x[1]
# Solve with scipy for reference
c = np.array([-1., -2.])
A_ub = np.array([[1,1],[1,0],[0,1],[-1,0],[0,-1]])
b_ub = np.array([4., 3., 3., 0., 0.])
lp_res = linprog(c, A_ub=A_ub, b_ub=b_ub, method='highs')
print('=== LP Solved with scipy.linprog ===')
print(f'Optimal x* = {lp_res.x.round(4)}')
print(f'Optimal value = {lp_res.fun:.4f}')
print(f'Status: {lp_res.message}')
# Log barrier: phi_t(x) = c^T x - t * sum_i log(b_i - a_i^T x)
# As t -> 0+, the minimiser of phi_t traces the central path toward the LP optimum
def log_barrier(x, t):
slack = b_ub - A_ub @ x
if np.any(slack <= 0): return np.inf
return c @ x - t * np.sum(np.log(slack))
t_vals = [1.0, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001]
x_bar = np.array([1.5, 1.5]) # strictly feasible starting point
barrier_vals = []
for t in t_vals:
res_t = spmin(lambda x: log_barrier(x, t), x_bar, method='Nelder-Mead',
options={'xatol':1e-8, 'fatol':1e-8, 'maxiter':10000})
x_bar = res_t.x
primal_val = c @ x_bar
barrier_vals.append(primal_val)
print('\nBarrier method convergence to LP optimal:')
for t, pv in zip(t_vals, barrier_vals):
gap = abs(pv - lp_res.fun)
print(f' t={t:.3f}: primal={pv:.4f}, gap={gap:.2e}')
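# Theory note (sketch): at the exact central-path point x(t), the duality gap is
# m*t with m the number of inequality constraints, so the suboptimality is at most
# m*t; the observed gaps should track this up to the Nelder-Mead solve accuracy.
m_constraints = len(b_ub)
print('\nCentral-path bound (gap <= m*t):')
for t, pv in zip(t_vals, barrier_vals):
    print(f' t={t:.3f}: observed gap={abs(pv - lp_res.fun):.2e}, m*t={m_constraints*t:.2e}')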
7. ML Applications: Normal Equations and Logistic Regression
Applying first-order optimality conditions to derive closed-form solutions for linear regression and verify convexity of logistic loss.
Code cell 27
# === 7.1 Normal Equations as KKT (OLS and Ridge) ===
np.random.seed(42)
n, d = 100, 5
X = np.random.randn(n, d)
w_true = np.array([1., -2., 0.5, 3., -1.])
y = X @ w_true + 0.3 * np.random.randn(n)
# OLS normal equations: X^T X w = X^T y
w_ols = la.solve(X.T @ X, X.T @ y)
# Ridge normal equations: (X^T X + lambda I) w = X^T y
lam = 1.0
w_ridge = la.solve(X.T @ X + lam*np.eye(d), X.T @ y)
# Verify: gradient should be zero at solution
grad_ols = X.T @ (X @ w_ols - y) / n
grad_ridge = X.T @ (X @ w_ridge - y) / n + lam * w_ridge / n
print('=== Normal Equations: OLS and Ridge ===')
print(f'True w: {w_true}')
print(f'OLS w: {w_ols.round(3)}')
print(f'Ridge w (lam={lam}): {w_ridge.round(3)}')
print(f'||grad OLS|| = {la.norm(grad_ols):.2e} (should be ~0)')
print(f'||grad Ridge|| = {la.norm(grad_ridge):.2e} (should be ~0)')
# Compare with sklearn
try:
from sklearn.linear_model import LinearRegression, Ridge
ols_sk = LinearRegression(fit_intercept=False).fit(X, y)
ridge_sk = Ridge(alpha=lam, fit_intercept=False).fit(X, y)
import numpy.testing as npt
npt.assert_allclose(w_ols, ols_sk.coef_, rtol=1e-5)
npt.assert_allclose(w_ridge, ridge_sk.coef_, rtol=1e-5)
print('PASS — matches sklearn OLS and Ridge')
except ImportError:
print('sklearn not available, skipping comparison')
Code cell 28
# === 7.2 Hessian Analysis: Ridge vs OLS Conditioning ===
# Hessian of OLS: (1/n) X^T X -- can be ill-conditioned
# Hessian of Ridge: (1/n) X^T X + lambda I -- always positive definite
lam_vals = [0.0, 0.1, 0.5, 1.0, 5.0]
print('=== Hessian Eigenvalue Analysis: Ridge Regularisation ===')
eigvals_XtX = la.eigvalsh(X.T @ X / n)
print(f'Eigenvalues of (1/n)X^TX: min={eigvals_XtX.min():.4f}, max={eigvals_XtX.max():.4f}')
print(f'OLS condition number: {eigvals_XtX.max()/eigvals_XtX.min():.1f}')
print()
for lam in lam_vals:
H = X.T @ X / n + lam * np.eye(d)
eigs = la.eigvalsh(H)
kappa = eigs.max() / eigs.min()
print(f' lambda={lam:.1f}: min_eig={eigs.min():.4f}, cond={kappa:.1f}')
print('\nRidge adds lambda to all eigenvalues, bounding min eigenvalue away from 0.')
print('This makes the problem strongly convex (mu >= lambda) and well-conditioned.')
Code cell 29
# === 7.3 Logistic Regression: Convex Loss Verification ===
def sigmoid(z): return 1 / (1 + np.exp(-np.clip(z, -30, 30)))
# Logistic loss: f(w) = (1/n) sum_i [-y_i w^T x_i + log(1 + exp(w^T x_i))]
# Hessian: (1/n) X^T diag(sigma_i(1-sigma_i)) X >= 0 always
np.random.seed(42)
n_lr, d_lr = 50, 3
X_lr = np.random.randn(n_lr, d_lr)
w_true_lr = np.array([1., -1., 0.5])
y_lr = (sigmoid(X_lr @ w_true_lr) > 0.5).astype(float)
def logistic_loss(w):
z = X_lr @ w
return np.mean(-y_lr * z + np.log1p(np.exp(np.clip(z, -50, 50))))
def logistic_hessian(w):
z = X_lr @ w
sigma = sigmoid(z)
D = np.diag(sigma * (1 - sigma))
return X_lr.T @ D @ X_lr / n_lr
# Check PSD at multiple points
test_ws = [np.zeros(d_lr), np.ones(d_lr), np.random.randn(d_lr), w_true_lr]
print('=== Logistic Loss Hessian: PSD Verification ===')
for w in test_ws:
H = logistic_hessian(w)
eigs = la.eigvalsh(H)
psd = np.all(eigs >= -1e-10)
print(f' w={w.round(2)}: min_eig={eigs.min():.6f}, PSD={psd}')
# Find minimum via gradient descent
w = np.zeros(d_lr)
lr = 0.5
for _ in range(1000):
g = grad_fd(logistic_loss, w)
w -= lr * g
grad_at_min = grad_fd(logistic_loss, w)
print(f'\nOptimal w: {w.round(4)}')
print(f'||grad|| at minimum: {la.norm(grad_at_min):.2e} (should be ~0)')
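# Gradient check (sketch): analytic gradient of the logistic loss, (1/n) X^T (sigma(Xw) - y),
# validated against the finite-difference gradient via the grad_check helper defined above.
def logistic_grad(w_vec):
    return X_lr.T @ (sigmoid(X_lr @ w_vec) - y_lr) / n_lr
rel_err = grad_check(logistic_loss, w, logistic_grad(w))
print(f'Gradient check (relative error, analytic vs finite difference): {rel_err:.2e}')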
Code cell 30
# === 7.4 Maximum Entropy: Softmax from Lagrange Multipliers ===
# Max entropy over {1,...,K} subject to:
# sum_i p_i = 1 (normalisation)
# sum_i p_i * i = mu_target (mean constraint)
# Solution: p_i* = exp(-lambda_0 - lambda_1 * i) / Z
# <=> p_i* = softmax(-lambda_1 * [1,...,K])
from scipy.optimize import fsolve
K = 6
items = np.arange(1, K+1).astype(float)
mu_target = 3.0
def max_entropy_dist(lam1):
"""Given lambda_1, compute max entropy distribution."""
log_p = -lam1 * items
log_p -= np.max(log_p) # numerical stability
p = np.exp(log_p)
return p / p.sum()
def mean_constraint(lam1):
p = max_entropy_dist(lam1)
return p @ items - mu_target
lam1_star = fsolve(mean_constraint, 0.5)[0]
p_maxent = max_entropy_dist(lam1_star)
p_uniform = np.ones(K) / K
def entropy(p): return -np.sum(p * np.log(p + 1e-12))
print('=== Maximum Entropy Distribution ===')
print(f'K={K} outcomes, target mean={mu_target}')
print(f'lambda_1* = {lam1_star:.4f}')
print(f'Max-entropy distribution: {p_maxent.round(4)}')
print(f'Mean of max-ent dist: {p_maxent @ items:.6f} (target: {mu_target})')
print(f'H(max-ent) = {entropy(p_maxent):.4f}')
print(f'H(uniform) = {entropy(p_uniform):.4f}')
print(f'Max-ent has higher entropy: {entropy(p_maxent) > entropy(p_uniform)}')
# Connection to softmax: p_maxent is softmax(-lambda_1 * items)
softmax_p = np.exp(-lam1_star * items) / np.exp(-lam1_star * items).sum()
print(f'\nSoftmax formulation matches: {np.allclose(p_maxent, softmax_p, atol=1e-8)}')
print('The Lagrange multiplier lambda_1* sets the logits (-lambda_1 * i), playing the role of an inverse temperature.')
8. Non-Convex Landscape: Saddle Points and Deep Learning
Code cell 32
# === 8.1 Saddle Point Dominance in High Dimensions ===
# Simulate random critical points in high dimensions
# For a Gaussian random function, critical points near the global min
# are overwhelmingly saddles in high dimensions.
# Simplified toy model: at a critical point, treat the Hessian eigenvalues as
# independent Gaussians whose mean shifts upward as the loss decreases
# (a crude stand-in for the GOE / random-matrix picture): lambda_i ~ N(shift, 1)
np.random.seed(42)
n_dims = [10, 50, 100, 500]
print('=== Saddle Point Dominance Analysis ===')
print('Simulating random Hessians at critical points near different loss values:')
for n_d in n_dims:
n_trials = 200
n_minima = 0
n_saddles = 0
# Critical points near a good solution (low loss): Hessian mostly PD
for _ in range(n_trials):
        # Eigenvalues drawn i.i.d. from N(2, 1): near a good solution most are
        # positive, but the chance that *all* are positive shrinks with dimension
        eigs = np.random.randn(n_d) + 2.0
if np.all(eigs > 0):
n_minima += 1
else:
n_saddles += 1
pct_min = 100 * n_minima / n_trials
print(f' n={n_d}: {pct_min:.0f}% local minima, {100-pct_min:.0f}% saddles')
print('\n(Near the global minimum, random critical points are mostly saddles in high n)')
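# Analytic comparison (sketch): under this toy model (i.i.d. eigenvalues ~ N(2,1)),
# P(all eigenvalues > 0) = Phi(2)^n, which collapses as n grows (scipy.stats imported above).
print('Analytic P(all eigenvalues > 0) = Phi(2)^n:')
for n_d in n_dims:
    print(f'  n={n_d}: {100 * stats.norm.cdf(2.0)**n_d:.2f}% local minima expected')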
Code cell 33
# === 8.2 SAM: KKT Solution for Inner Maximisation ===
# SAM inner problem: max_eps L(w + eps) s.t. ||eps|| <= rho
# KKT: grad_eps L(w+eps*) = 2*mu* eps* (stationarity)
# => eps* || grad L(w) (parallel)
# eps* = rho * grad L(w) / ||grad L(w)|| (on boundary)
def sam_step(w, f, rho=0.05, eta=0.01):
"""One SAM update: perturb + gradient step."""
g = grad_fd(f, w)
# KKT solution: eps* = rho * g / ||g||
eps_star = rho * g / (la.norm(g) + 1e-12)
# Gradient at perturbed point
g_perturbed = grad_fd(f, w + eps_star)
return w - eta * g_perturbed, eps_star
# Sharp quadratic: f(x,y) = 0.5*(100 x^2 + y^2)
# Min at (0,0); very sharp in x-direction
f_sharp = lambda w: 0.5 * (100 * w[0]**2 + w[1]**2)
f_flat = lambda w: 0.5 * (w[0]**2 + w[1]**2) # flat reference
np.random.seed(42)
w0 = np.array([1.0, 1.0])
rho = 0.1; eta = 0.005
# Standard GD on sharp function
w_gd = w0.copy()
traj_gd = [w_gd.copy()]
for _ in range(50):
g = grad_fd(f_sharp, w_gd)
w_gd = w_gd - eta * g
traj_gd.append(w_gd.copy())
traj_gd = np.array(traj_gd)
# SAM on sharp function
w_sam = w0.copy()
traj_sam = [w_sam.copy()]
for _ in range(50):
w_sam, eps = sam_step(w_sam, f_sharp, rho=rho, eta=eta)
traj_sam.append(w_sam.copy())
traj_sam = np.array(traj_sam)
print('=== SAM vs GD on Sharp Quadratic ===')
print(f'Initial: w={w0}, f={f_sharp(w0):.2f}')
print(f'GD final: w={traj_gd[-1].round(4)}, f={f_sharp(traj_gd[-1]):.4f}')
print(f'SAM final: w={traj_sam[-1].round(4)}, f={f_sharp(traj_sam[-1]):.4f}')
# Verify KKT: eps* should be parallel to gradient
g0 = grad_fd(f_sharp, w0)
eps_kkt = rho * g0 / la.norm(g0)
print(f'\nKKT verification at w={w0}:')
print(f' eps_kkt = {eps_kkt.round(4)}')
print(f' ||eps_kkt|| = {la.norm(eps_kkt):.4f} (should be {rho})')
print(f' eps_kkt parallel to grad? cos={np.dot(eps_kkt, g0)/(la.norm(eps_kkt)*la.norm(g0)):.6f}')
Code cell 34
# === 8.3 Visualising the Loss Landscape ===
# Compare sharp vs flat loss landscapes and gradient trajectories
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for ax, traj, title in zip(
axes,
[traj_gd, traj_sam],
['Gradient Descent (sharp quadratic)', 'SAM (rho=0.1)']
):
x_range = np.linspace(-1.2, 1.2, 100)
y_range = np.linspace(-1.2, 1.2, 100)
Z = np.array([[f_sharp(np.array([xi, yi])) for xi in x_range] for yi in y_range])
ax.contour(x_range, y_range, Z, levels=np.logspace(-2, 2, 20), cmap='viridis', alpha=0.6)
ax.plot(traj[:, 0], traj[:, 1], 'r-o', markersize=3, linewidth=2, label='Trajectory')
ax.plot(traj[0, 0], traj[0, 1], 'bs', markersize=10, label='Start')
ax.plot(traj[-1, 0], traj[-1, 1], 'r*', markersize=12, label='End')
ax.set_title(title)
ax.set_xlabel('x'); ax.set_ylabel('y')
ax.legend()
plt.suptitle('Sharp Quadratic: 0.5*(100x²+y²)', fontsize=13)
plt.tight_layout(); plt.show()
Code cell 35
# === 8.4 Neural Collapse: ETF Structure ===
# Show that the Equiangular Tight Frame (ETF) is the KKT point
# of the Unconstrained Features Model for C-class classification
C = 4 # number of classes
d = 10 # feature dimension (d >= C)
# ETF: C vectors with equal norms and equal pairwise cosine similarity = -1/(C-1)
# Construction: normalised columns of (I - 11^T/C), optionally embedded in R^d
def construct_etf(C, d):
    """Simplex ETF with C vectors, embedded in R^d via a random orthonormal map."""
    assert d >= C, 'this embedding uses d >= C (an ETF itself only needs d >= C-1)'
    # Exact simplex ETF in R^C: normalised columns of the centering matrix
    S = np.eye(C) - np.ones((C, C)) / C
    S /= la.norm(S, axis=0, keepdims=True)
    # A map with orthonormal columns R^C -> R^d preserves all pairwise inner products
    rng = np.random.default_rng(42)
    Q, _ = np.linalg.qr(rng.standard_normal((d, C)))
    return Q @ S
# Exact base case: the simplex ETF in R^C (this is what the verification below uses)
def etf_simplex(C):
"""Simplex ETF: C vectors in R^C with equal cosine similarity."""
# Use rows of (I - (1/C) 11^T) normalised
M = np.eye(C) - np.ones((C,C))/C
norms = la.norm(M, axis=0, keepdims=True)
return M / norms
M_etf = etf_simplex(C)
inner_prods = M_etf.T @ M_etf
target_diag = 1.0
target_offdiag = -1/(C-1)
print('=== Neural Collapse: ETF Verification ===')
print(f'C={C} classes, feature dim={C}')
print(f'Expected off-diagonal cosine similarity: {target_offdiag:.4f}')
diag_ok = np.allclose(np.diag(inner_prods), 1.0, atol=1e-10)
offdiag_ok = np.allclose(inner_prods - np.diag(np.diag(inner_prods)), target_offdiag, atol=1e-10)
print(f'All diagonal entries = 1: {diag_ok}')
print(f'All off-diagonal = -1/(C-1)={target_offdiag:.4f}: {offdiag_ok}')
print(f'\nInner product matrix:\n{inner_prods.round(4)}')
print('\nThe simplex ETF is the optimal KKT geometry of the UFM at terminal training (unique up to rotation).')
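# Further checks (sketch): the C ETF vectors sum to the zero vector (they are
# centred) and span a (C-1)-dimensional subspace.
print(f'Sum of ETF vectors (should be ~0): {np.abs(M_etf.sum(axis=1)).max():.2e}')
print(f'Rank of ETF matrix: {np.linalg.matrix_rank(M_etf)} (expected C-1 = {C-1})')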
9. RLHF and DPO: KKT in Language Model Alignment
Code cell 37
# === 9.1 RLHF Optimal Policy: Boltzmann Distribution ===
# RLHF: max E[r(x,y)] s.t. KL(pi || pi_ref) <= delta
# KKT optimal policy: pi*(y|x) ∝ pi_ref(y|x) * exp(r(x,y)/beta)
# Simulate with discrete outcome space
K_rlhf = 5 # number of responses
np.random.seed(42)
# Reference policy (uniform-ish)
pi_ref = np.array([0.1, 0.3, 0.25, 0.2, 0.15])
pi_ref /= pi_ref.sum()
# Reward function (quality of responses)
r = np.array([0.5, 1.0, 2.0, 0.8, -0.5])
def optimal_rlhf_policy(pi_ref, r, beta):
"""KKT solution to RLHF: Boltzmann tilt of pi_ref."""
log_pi = np.log(pi_ref + 1e-12) + r / beta
log_pi -= log_pi.max() # numerical stability
pi = np.exp(log_pi)
return pi / pi.sum()
def kl_div(p, q):
"""KL(p || q)"""
return np.sum(p * (np.log(p+1e-12) - np.log(q+1e-12)))
print('=== RLHF Optimal Policy (KKT Boltzmann Solution) ===')
print(f'Reference policy: {pi_ref.round(3)}')
print(f'Reward: {r}')
print()
betas = [0.1, 0.5, 1.0, 5.0, 100.0]
for beta in betas:
pi_star = optimal_rlhf_policy(pi_ref, r, beta)
expected_r = pi_star @ r
kl = kl_div(pi_star, pi_ref)
print(f' beta={beta:.1f}: E[r]={expected_r:.3f}, KL={kl:.3f}, best_response={pi_star.argmax()}')
print()
print('Small beta: reward-seeking (high E[r], high KL from ref)')
print('Large beta: conservative (close to ref, lower E[r])')
print('Beta is the Lagrange multiplier on the KL constraint.')
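# Cross-check (sketch): maximise E_pi[r] - beta*KL(pi || pi_ref) numerically over the
# simplex (softmax parameterisation) and compare with the closed-form Boltzmann policy.
beta_chk = 1.0
def neg_rlhf_objective(theta):
    pi = softmax(theta)
    return -(pi @ r - beta_chk * kl_div(pi, pi_ref))
res_rlhf = spmin(neg_rlhf_objective, np.zeros(K_rlhf), method='BFGS')
pi_numeric = softmax(res_rlhf.x)
pi_closed = optimal_rlhf_policy(pi_ref, r, beta_chk)
print(f'Max |numeric - closed form| policy difference (beta=1): '
      f'{np.max(np.abs(pi_numeric - pi_closed)):.2e}')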
Code cell 38
# === 9.2 DPO: KKT Reparameterisation ===
# DPO insight: from KKT of RLHF,
# r(x,y) = beta * log(pi*(y|x) / pi_ref(y|x)) + beta * log Z(x)
# Since log Z(x) is constant across responses, the reward ranking is:
# r(y_w) > r(y_l) <=> pi*(y_w)/pi_ref(y_w) > pi*(y_l)/pi_ref(y_l)
# DPO objective: -E[log sigma(beta * log(pi(y_w)/pi_ref(y_w)) - beta * log(pi(y_l)/pi_ref(y_l)))]
# Demonstrate that DPO implicit reward matches the true reward ordering
beta_dpo = 1.0
# Use beta=1 optimal policy as 'trained model'
pi_trained = optimal_rlhf_policy(pi_ref, r, beta_dpo)
# DPO implicit reward for each response
dpo_reward = beta_dpo * (np.log(pi_trained + 1e-12) - np.log(pi_ref + 1e-12))
# This should rank responses the same as r (up to a constant)
print('=== DPO: Implicit Reward Recovery ===')
print(f'True reward r: {r}')
print(f'DPO implicit reward: {dpo_reward.round(4)}')
print()
# Verify ranking matches
true_rank = np.argsort(r)[::-1]
dpo_rank = np.argsort(dpo_reward)[::-1]
print(f'True reward ranking: {true_rank}')
print(f'DPO reward ranking: {dpo_rank}')
print(f'Rankings match: {np.array_equal(true_rank, dpo_rank)}')
print()
print('DPO recovers the reward ranking from KKT structure alone.')
print('No RL required: the optimal policy encodes the reward in its log-ratio to pi_ref.')
10. Numerical Optimisation: Newton's Method and Interior Points
Code cell 40
# === 10.1 Newton's Method for Unconstrained Optimisation ===
# Newton step: x_{k+1} = x_k - H(x_k)^{-1} grad f(x_k)
# Quadratic convergence near non-degenerate minimum
def newton_method(f, x0, max_iter=20, tol=1e-10):
"""Newton's method with quadratic convergence tracking."""
x = x0.copy().astype(float)
history = [{'x': x.copy(), 'f': f(x), 'grad_norm': la.norm(grad_fd(f,x))}]
for k in range(max_iter):
g = grad_fd(f, x)
if la.norm(g) < tol:
break
H = hessian_fd(f, x)
try:
dx = -la.solve(H, g)
except la.LinAlgError:
break
x = x + dx
history.append({'x': x.copy(), 'f': f(x), 'grad_norm': la.norm(grad_fd(f,x))})
return x, history
# Test on Rosenbrock (challenging non-convex for GD, easy for Newton)
f_rosen = lambda x: (1-x[0])**2 + 100*(x[1]-x[0]**2)**2
x0 = np.array([-1.0, 1.0])
x_star_newton, history_newton = newton_method(f_rosen, x0)
print('=== Newton\'s Method on Rosenbrock ===')
print(f'x0 = {x0}')
print(f'x* = {x_star_newton.round(8)}')
print(f'f(x*) = {f_rosen(x_star_newton):.2e}')
print(f'Iterations: {len(history_newton)-1}')
print('\nConvergence (gradient norm):')
for i, h in enumerate(history_newton):
print(f' k={i}: ||grad||={h["grad_norm"]:.2e}')
print('\nNote: quadratic convergence in the final phase.')
Code cell 41
# === 10.2 Newton vs Gradient Descent: Convergence Comparison ===
# For a quadratic f(x) = 0.5 x^T Q x - b^T x with Q = diag(1, kappa)
# GD: needs O(kappa) iterations; Newton: 1 step (exact for quadratic)
def gd_history(f, grad_f, x0, lr, n_steps):
x = x0.copy()
h = [la.norm(grad_f(x))]
for _ in range(n_steps):
x = x - lr * grad_f(x)
h.append(la.norm(grad_f(x)))
return h
kappa_vals = [10, 50, 200]
if HAS_MPL:
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, kappa in zip(axes, kappa_vals):
Q = np.diag([1.0, float(kappa)])
b = np.array([1.0, 1.0])
f_quad = lambda x: 0.5 * x @ Q @ x - b @ x
grad_quad = lambda x: Q @ x - b
x0 = np.array([2.0, 2.0])
lr_gd = 2.0 / (1.0 + kappa) # optimal step size
h_gd = gd_history(f_quad, grad_quad, x0, lr_gd, 200)
_, h_newton = newton_method(f_quad, x0)
h_newton_norms = [h['grad_norm'] for h in h_newton]
ax.semilogy(h_gd, 'b-', label='Gradient Descent', linewidth=2)
ax.semilogy(h_newton_norms, 'r-o', label='Newton', linewidth=2, markersize=6)
ax.set_title(f'kappa={kappa} (cond number)')
ax.set_xlabel('Iteration'); ax.set_ylabel('||grad||')
ax.legend(); ax.grid(True, alpha=0.4)
plt.suptitle('Newton vs GD: Convergence on Quadratic', fontsize=13)
plt.tight_layout(); plt.show()
11. Convex Optimisation in Practice: CVXPY
CVXPY provides a disciplined convex programming interface that automatically verifies convexity and solves problems using interior point methods.
Code cell 43
# === 11.1 Portfolio Optimisation with CVXPY ===
try:
import cvxpy as cp
HAS_CVXPY = True
except ImportError:
HAS_CVXPY = False
print('cvxpy not available — showing manual solution instead')
np.random.seed(42)
n_assets = 5
# Random covariance matrix (PD)
A = np.random.randn(n_assets, n_assets)
Sigma_pf = A.T @ A / n_assets + 0.1 * np.eye(n_assets)
mu_pf = np.random.randn(n_assets) * 0.1 + 0.05 # expected returns
if HAS_CVXPY:
w = cp.Variable(n_assets)
risk = cp.quad_form(w, Sigma_pf)
ret_target = 0.06
constraints = [
cp.sum(w) == 1,
mu_pf @ w >= ret_target,
w >= 0
]
prob = cp.Problem(cp.Minimize(risk), constraints)
prob.solve(solver=cp.OSQP, eps_abs=1e-8)
print('=== CVXPY Portfolio Optimisation ===')
print(f'Status: {prob.status}')
print(f'Optimal weights: {w.value.round(4)}')
print(f'Expected return: {mu_pf @ w.value:.4f} (target: {ret_target})')
print(f'Portfolio risk (var): {risk.value:.6f}')
print(f'Sum of weights: {w.value.sum():.6f}')
# Shadow prices from dual variables
if constraints[0].dual_value is not None:
print(f'\nDual (shadow prices):')
print(f' Budget constraint: {constraints[0].dual_value:.4f}')
print(f' Return constraint: {constraints[1].dual_value:.4f}')
print(f' (Return multiplier = cost of requiring higher return)')
else:
# Manual: solve via KKT for long-only with budget and return constraints
# Using scipy.optimize
from scipy.optimize import minimize as spmin
def portfolio_risk(w):
return w @ Sigma_pf @ w
cons = [{'type':'eq','fun': lambda w: w.sum()-1},
{'type':'ineq','fun': lambda w: mu_pf @ w - 0.06}]
bounds = [(0, None)] * n_assets
res = spmin(portfolio_risk, np.ones(n_assets)/n_assets,
constraints=cons, bounds=bounds, method='SLSQP')
print('=== scipy Portfolio Optimisation ===')
print(f'Weights: {res.x.round(4)}')
print(f'Risk: {res.fun:.6f}')
Code cell 44
# === 11.2 Lasso via CVXPY vs Proximal Gradient ===
np.random.seed(42)
n_lasso, d_lasso = 50, 20
X_lasso = np.random.randn(n_lasso, d_lasso)
w_sparse = np.zeros(d_lasso)
w_sparse[[0, 5, 12]] = [1.5, -2.0, 0.8]
y_lasso = X_lasso @ w_sparse + 0.1*np.random.randn(n_lasso)
lam_lasso = 0.1
# Proximal gradient descent for Lasso
def soft_threshold(v, threshold):
return np.sign(v) * np.maximum(np.abs(v) - threshold, 0)
L_lip = la.norm(X_lasso.T @ X_lasso / n_lasso, 2) # Lipschitz constant
w_lasso = np.zeros(d_lasso)
losses = []
for _ in range(500):
grad_smooth = X_lasso.T @ (X_lasso @ w_lasso - y_lasso) / n_lasso
w_lasso = soft_threshold(w_lasso - grad_smooth / L_lip, lam_lasso / L_lip)
losses.append(0.5*la.norm(X_lasso @ w_lasso - y_lasso)**2/n_lasso + lam_lasso*la.norm(w_lasso,1))
print('=== Lasso via Proximal Gradient ===')
print(f'True sparse weights: {w_sparse}')
print(f'Lasso solution: {w_lasso.round(4)}')
print(f'Nonzero entries: {np.sum(np.abs(w_lasso) > 1e-3)}')
print(f'Final loss: {losses[-1]:.6f}')
# Verify KKT: for nonzero weights, |X_j^T r / n| = lambda
# for zero weights, |X_j^T r / n| <= lambda
residual = y_lasso - X_lasso @ w_lasso
correlations = np.abs(X_lasso.T @ residual / n_lasso)
active = np.abs(w_lasso) > 1e-3
print(f'\nKKT verification:')
print(f'Active features: {np.where(active)[0]}')
print(f'Max |corr| for inactive: {correlations[~active].max():.4f} (should be <= {lam_lasso})')
print(f'|corr| for active: {correlations[active].round(4)} (should be ~{lam_lasso})')
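# Cross-check (sketch): scikit-learn's Lasso minimises the same objective,
# (1/(2n))||y - Xw||^2 + alpha*||w||_1, so with alpha = lam_lasso it should roughly
# match the proximal-gradient solution (up to iteration tolerance).
try:
    from sklearn.linear_model import Lasso
    lasso_sk = Lasso(alpha=lam_lasso, fit_intercept=False, max_iter=10000).fit(X_lasso, y_lasso)
    print(f'\nMax |proximal - sklearn| coefficient difference: '
          f'{np.max(np.abs(w_lasso - lasso_sk.coef_)):.2e}')
except ImportError:
    print('\nsklearn not available, skipping Lasso comparison')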
Code cell 45
# === 11.3 Efficient Frontier via KKT ===
# The efficient frontier is parameterised by the Lagrange multiplier lambda_1
# (required return). As lambda_1 varies, we trace all Pareto-optimal portfolios.
n_ef = 5
np.random.seed(0)
A_ef = np.random.randn(n_ef, n_ef)
Sig = A_ef.T @ A_ef / n_ef + 0.1*np.eye(n_ef)
mu_ef = np.array([0.03, 0.07, 0.05, 0.09, 0.04])
def min_var_portfolio(r_target):
"""KKT solution: min w^T Sig w s.t. mu^T w=r, 1^T w=1."""
from scipy.optimize import minimize as spmin
n = len(mu_ef)
cons = [{'type':'eq','fun': lambda w: mu_ef @ w - r_target},
{'type':'eq','fun': lambda w: w.sum() - 1}]
res = spmin(lambda w: w @ Sig @ w, np.ones(n)/n,
constraints=cons, method='SLSQP', tol=1e-10)
if res.success:
return res.x, res.fun
return None, None
r_range = np.linspace(0.035, 0.085, 50)
vols = []
for r in r_range:
w, var = min_var_portfolio(r)
if w is not None and var is not None:
vols.append(np.sqrt(var))
else:
vols.append(np.nan)
vols = np.array(vols)
if HAS_MPL:
fig, ax = plt.subplots(figsize=(8, 6))
mask = ~np.isnan(vols)
ax.plot(vols[mask]*100, r_range[mask]*100, 'b-', linewidth=2)
for i, (m, name) in enumerate(zip(mu_ef, ['A','B','C','D','E'])):
sig_i = np.sqrt(Sig[i,i])
ax.scatter(sig_i*100, m*100, s=80, zorder=5)
ax.annotate(name, (sig_i*100+0.1, m*100))
ax.set_xlabel('Volatility (%)')
ax.set_ylabel('Expected Return (%)')
ax.set_title('Efficient Frontier via KKT')
ax.grid(True, alpha=0.4)
plt.tight_layout(); plt.show()
print('Each point on the frontier is the KKT solution for a specific target return.')
print('Moving along the frontier = changing the Lagrange multiplier lambda_1.')
12. Gradient Flow and Continuous-Time Optimality
Code cell 47
# === 12.1 Gradient Flow: Continuous-Time Limit of GD ===
# GD: x_{k+1} = x_k - eta * grad f(x_k)
# As eta -> 0: dx/dt = -grad f(x) (gradient flow ODE)
# At equilibrium: grad f(x*) = 0 <=> FONC
from scipy.integrate import solve_ivp
# Gradient flow on f(x,y) = x^2 + 2y^2 (bowl)
f_bowl2 = lambda xy: xy[0]**2 + 2*xy[1]**2
grad_bowl2 = lambda t, xy: [-2*xy[0], -4*xy[1]]
# Exact solution: x(t) = x0 * exp(-2t), y(t) = y0 * exp(-4t)
x0_flow = np.array([1.5, 1.0])
t_span = (0, 3)
t_eval = np.linspace(0, 3, 200)
sol = solve_ivp(grad_bowl2, t_span, x0_flow, t_eval=t_eval, rtol=1e-8)
x_flow = sol.y[0]; y_flow = sol.y[1]
# Verify: reaches origin (critical point = FONC)
print('=== Gradient Flow Convergence ===')
print(f'Start: ({x0_flow[0]}, {x0_flow[1]})')
print(f'End: ({x_flow[-1]:.6f}, {y_flow[-1]:.6f}) (should be ~(0,0))')
print(f'f(start)={f_bowl2(x0_flow):.4f}, f(end)={f_bowl2(np.array([x_flow[-1],y_flow[-1]])):.2e}')
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
axes[0].plot(t_eval, x_flow, 'b-', label='x(t) = x0 e^{-2t}')
axes[0].plot(t_eval, y_flow, 'r-', label='y(t) = y0 e^{-4t}')
axes[0].plot(t_eval, x0_flow[0]*np.exp(-2*t_eval), 'b--', alpha=0.5, label='exact x')
axes[0].plot(t_eval, x0_flow[1]*np.exp(-4*t_eval), 'r--', alpha=0.5, label='exact y')
axes[0].set_xlabel('t'); axes[0].set_ylabel('value')
axes[0].set_title('Gradient Flow: Coordinates vs Time')
axes[0].legend()
axes[1].plot(x_flow, y_flow, 'g-', linewidth=2)
axes[1].plot(x0_flow[0], x0_flow[1], 'bs', markersize=10, label='Start')
axes[1].plot(0, 0, 'r*', markersize=14, label='Critical point')
xr = np.linspace(-0.2, 1.7, 100)
for c in [0.5, 1, 2, 3, 4]:
theta = np.linspace(0, 2*np.pi, 200)
ax2_x = np.sqrt(c)*np.cos(theta)
ax2_y = np.sqrt(c/2)*np.sin(theta)
axes[1].plot(ax2_x, ax2_y, 'k-', alpha=0.15)
axes[1].set_xlabel('x'); axes[1].set_ylabel('y')
axes[1].set_title('Gradient Flow Trajectory')
axes[1].legend()
plt.suptitle('Gradient flow: $\\dot{x} = -\\nabla f(x)$', fontsize=13)
plt.tight_layout(); plt.show()
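# Discretisation check (sketch): gradient descent with a small step eta is forward
# Euler on the flow, so after k steps it should track the exact solution at t = k*eta.
eta_flow, n_steps = 0.01, 300          # total time T = 3, matching t_span
x_disc = x0_flow.copy()
for _ in range(n_steps):
    x_disc = x_disc - eta_flow * np.array([2*x_disc[0], 4*x_disc[1]])
x_exact_T = x0_flow * np.exp(np.array([-2.0, -4.0]) * eta_flow * n_steps)
print(f'GD (eta={eta_flow}) after {n_steps} steps: {x_disc.round(6)}')
print(f'Exact flow at t={eta_flow*n_steps:.1f}:         {x_exact_T.round(6)}')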
Code cell 48
# === 12.2 Lyapunov Analysis: Loss is a Lyapunov Function ===
# For gradient flow dx/dt = -grad f:
# d/dt f(x(t)) = grad f(x)^T dx/dt = -||grad f(x)||^2 <= 0
# So f is monotonically decreasing along gradient flow (it's a Lyapunov function)
# Verify numerically and visualise
f_along_flow = np.array([f_bowl2(np.array([x_flow[i], y_flow[i]])) for i in range(len(t_eval))])
grad_norm_sq = np.array([(2*x_flow[i])**2 + (4*y_flow[i])**2 for i in range(len(t_eval))])
# Numerical derivative of f along flow
df_dt_numerical = np.gradient(f_along_flow, t_eval)
df_dt_theoretical = -grad_norm_sq # should match
print('=== Lyapunov Analysis: f Decreasing Along Gradient Flow ===')
print(f'f(0) = {f_along_flow[0]:.4f}')
print(f'f(T) = {f_along_flow[-1]:.2e}')
print(f'f is monotonically decreasing: {np.all(np.diff(f_along_flow) <= 1e-10)}')
# Verify df/dt = -||grad f||^2
mid = len(t_eval)//2
print(f'At t={t_eval[mid]:.2f}: df/dt_numerical={df_dt_numerical[mid]:.4f}, -||grad||^2={df_dt_theoretical[mid]:.4f}')
if HAS_MPL:
fig, ax = plt.subplots(figsize=(10, 4))
ax.semilogy(t_eval, f_along_flow, 'b-', linewidth=2, label='f(x(t)) (loss)')
ax.set_xlabel('t'); ax.set_ylabel('f(x(t))')
ax.set_title('Loss is Lyapunov Function: $\\frac{d}{dt}f(x(t)) = -\\|\\nabla f\\|^2 \\leq 0$')
ax.grid(True, alpha=0.4)
plt.tight_layout(); plt.show()
13. Numerical KKT Verification and Gradient Checking
Code cell 50
# === 13.1 Complete KKT System Solver and Verifier ===
# Demonstrate full KKT analysis for a QP
# min 0.5 ||Ax-b||^2 + 0.5*lambda*||x||^2 s.t. Cx <= d, Ex = f
np.random.seed(42)
n_kkt, m_ineq, m_eq = 4, 2, 1
A_kkt = np.random.randn(6, n_kkt)
b_kkt = np.random.randn(6)
C_kkt = np.array([[1., 1., 0., 0.], [0., 0., 1., 1.]]) # x1+x2<=1, x3+x4<=1
d_kkt = np.array([1.0, 1.0])
E_kkt = np.array([[1., -1., 1., -1.]]) # x1-x2+x3-x4=0
f_kkt = np.array([0.0])
lam_kkt = 0.1
def qp_objective(x):
return 0.5*la.norm(A_kkt @ x - b_kkt)**2 + 0.5*lam_kkt*la.norm(x)**2
def qp_objective_grad(x):
return A_kkt.T @ (A_kkt @ x - b_kkt) + lam_kkt * x
from scipy.optimize import minimize as spmin
cons = [{'type':'ineq','fun': lambda x: d_kkt - C_kkt @ x}, # d - Cx >= 0
{'type':'eq','fun': lambda x: E_kkt @ x - f_kkt}]
x0 = np.zeros(n_kkt)
res = spmin(qp_objective, x0, jac=qp_objective_grad,
method='SLSQP', constraints=cons, tol=1e-10)
x_opt = res.x
print('=== Full KKT Analysis of Constrained QP ===')
print(f'Optimal x* = {x_opt.round(4)}')
print(f'Objective f(x*) = {qp_objective(x_opt):.6f}')
print(f'Ineq constraints: {(C_kkt @ x_opt - d_kkt).round(4)}')
print(f'Eq constraint: {(E_kkt @ x_opt - f_kkt).round(4)}')
# Gradient at optimal
g_opt = qp_objective_grad(x_opt)
print(f'||grad f(x*)|| = {la.norm(g_opt):.2e}')
print()
# Active constraint analysis
h_vals = d_kkt - C_kkt @ x_opt # slack
for j, (h, c_name) in enumerate(zip(h_vals, ['x1+x2<=1','x3+x4<=1'])):
status = 'ACTIVE' if abs(h) < 1e-4 else f'inactive (slack={h:.4f})'
print(f' Constraint {c_name}: {status}')
# Gradient check
g_fd = grad_fd(qp_objective, x_opt)
g_analytic = qp_objective_grad(x_opt)
print(f'\nGradient check: ||g_analytic - g_fd|| = {la.norm(g_analytic-g_fd):.2e}')
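# Multiplier recovery (sketch): estimate multipliers for the active constraints by
# least squares and check Lagrangian stationarity grad f + C_act^T mu + E^T nu ~ 0.
active_mask = np.abs(d_kkt - C_kkt @ x_opt) < 1e-4
G_active = np.vstack([C_kkt[active_mask], E_kkt])   # gradients of active ineq + eq constraints
mults, *_ = la.lstsq(G_active.T, -qp_objective_grad(x_opt), rcond=None)
stat_residual = qp_objective_grad(x_opt) + G_active.T @ mults
print(f'Recovered multipliers (active ineq first, then eq): {mults.round(4)}')
print(f'Lagrangian stationarity residual with recovered multipliers: {la.norm(stat_residual):.2e}')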
Code cell 51
# === 13.2 Summary Statistics and Key Takeaways ===
print('='*60)
print('OPTIMALITY CONDITIONS — KEY RESULTS SUMMARY')
print('='*60)
print()
print('1. FONC: grad f(x*) = 0')
print(' - Necessary for any local extremum')
print(' - Sufficient for global min when f is convex')
print()
print('2. SOSC: H(x*) > 0 (strict local min)')
print(' - Hessian PD => strict local minimum')
print(' - Morse index = # negative eigenvalues')
print()
print('3. KKT: generalises to constrained problems')
print(' - 4 conditions: stationarity, primal/dual feasibility, CS')
print(' - Lagrange multiplier = shadow price of constraint')
print()
print('4. Duality: d* <= p* (weak); d* = p* (strong, Slater)')
print(' - Duality gap = certificate of optimality')
print()
print('5. ML Applications:')
print(' - Normal equations: FONC of OLS')
print(' - PCA: KKT of variance-max on sphere => eigenvectors')
print(' - SVM: dual via KKT => kernel trick + support vectors')
print(' - Softmax: KKT of max-entropy under logit constraints')
print(' - RLHF: KKT of KL-constrained reward max => Boltzmann policy')
print(' - SAM: KKT of sharpness inner problem => perturbation formula')
print(' - Neural collapse: KKT of UFM => ETF geometry')
print()
print('Ready for exercises.ipynb!')
14. Summary
This notebook has demonstrated the core optimality conditions through computation:
| Section | Key Result |
|---|---|
| §1 First-order | ∇f(x*) = 0 necessary; visualised for 1D and 2D |
| §2 Second-order | Hessian eigenvalues classify: PD→min, ND→max, indefinite→saddle |
| §3 Convexity | ∇²f ⪰ 0 everywhere iff convex; strong convexity gives linear convergence |
| §4 Lagrange | PCA is Σv = λv; shadow price = multiplier |
| §5 KKT | 4 conditions verified; complementary slackness visualised |
| §6 Duality | SVM primal = dual (strong duality); LP barrier convergence |
| §7 ML apps | Normal equations, logistic convexity, softmax as max-entropy |
| §8 DL landscape | Saddle dominance, SAM KKT derivation, neural collapse ETF |
| §9 RLHF/DPO | Boltzmann policy as KKT; DPO implicit reward recovery |