Math for LLMs: Multivariate Calculus / Optimality Conditions

Exercises Notebook

Converted from exercises.ipynb for web reading.

Optimality Conditions - Exercises

10 graded exercises covering the full section arc, from multivariate calculus mechanics to ML-facing gradient systems.

Format          Description
Problem         Markdown cell with task description
Your Solution   Code cell for learner work
Solution        Reference solution with checks
Difficulty: straightforward -> moderate -> challenging.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

import numpy as np
import numpy.linalg as la
from scipy import optimize, special, stats
from scipy.optimize import minimize, fsolve, linprog
from math import factorial
import math
import matplotlib.patches as patches

COLORS = {
    "primary": "#0077BB",
    "secondary": "#EE7733",
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",
    "highlight": "#EE3377",
}
HAS_MPL = True
np.set_printoptions(precision=8, suppress=True)
np.random.seed(42)
spmin = minimize

try:
    import torch
    HAS_TORCH = True
except ImportError:
    torch = None
    HAS_TORCH = False


def header(title):
    print("\n" + "=" * len(title))
    print(title)
    print("=" * len(title))

def check_true(name, cond):
    ok = bool(cond)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    return ok

def check_close(name, got, expected, tol=1e-8):
    ok = np.allclose(got, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}: got {got}, expected {expected}")
    return ok

def check(name, got, expected, tol=1e-8):
    return check_close(name, got, expected, tol=tol)

def sigmoid(x):
    x = np.asarray(x, dtype=float)
    return np.where(x >= 0, 1/(1+np.exp(-x)), np.exp(x)/(1+np.exp(x)))

def softmax(z, axis=-1):
    z = np.asarray(z, dtype=float)
    z = z - np.max(z, axis=axis, keepdims=True)
    e = np.exp(z)
    return e / np.sum(e, axis=axis, keepdims=True)

def relu(x):
    return np.maximum(0, x)

def relu_prime(x):
    return np.where(np.asarray(x) > 0, 1.0, 0.0)

def centered_diff(f, x, h=1e-6):
    return (f(x + h) - f(x - h)) / (2 * h)

def numerical_gradient(f, x, h=1e-6):
    x = np.asarray(x, dtype=float)
    grad = np.zeros_like(x, dtype=float)
    for idx in np.ndindex(x.shape):
        xp = x.copy(); xm = x.copy()
        xp[idx] += h; xm[idx] -= h
        grad[idx] = (f(xp) - f(xm)) / (2*h)
    return grad

def numerical_jacobian(f, x, h=1e-6):
    x = np.asarray(x, dtype=float)
    y0 = np.asarray(f(x), dtype=float)
    J = np.zeros((y0.size, x.size), dtype=float)
    flat = x.reshape(-1)
    for j in range(x.size):
        xp = flat.copy(); xm = flat.copy()
        xp[j] += h; xm[j] -= h
        J[:, j] = ((np.asarray(f(xp.reshape(x.shape))) - np.asarray(f(xm.reshape(x.shape)))) / (2*h)).reshape(-1)
    return J.reshape(y0.shape + x.shape)

def grad_check(f, x, analytic_grad, h=1e-6):
    numeric_grad = numerical_gradient(f, x, h=h)
    denom = la.norm(analytic_grad) + la.norm(numeric_grad) + 1e-12
    return la.norm(analytic_grad - numeric_grad) / denom



def jacobian_fd(f, x, h=1e-6):
    x = np.asarray(x, dtype=float)
    y0 = np.asarray(f(x), dtype=float)
    J = np.zeros((y0.size, x.size), dtype=float)
    flat = x.reshape(-1)
    for j in range(x.size):
        xp = flat.copy(); xm = flat.copy()
        xp[j] += h; xm[j] -= h
        yp = np.asarray(f(xp.reshape(x.shape)), dtype=float).reshape(-1)
        ym = np.asarray(f(xm.reshape(x.shape)), dtype=float).reshape(-1)
        J[:, j] = (yp - ym) / (2*h)
    return J.reshape(y0.shape + x.shape)

def hessian_fd(f, x, h=1e-5):
    x = np.asarray(x, dtype=float)
    H = np.zeros((x.size, x.size), dtype=float)
    flat = x.reshape(-1)
    for j in range(x.size):
        xp = flat.copy(); xm = flat.copy()
        xp[j] += h; xm[j] -= h
        gp = numerical_gradient(lambda z: f(z.reshape(x.shape)), xp.reshape(x.shape), h=h).reshape(-1)
        gm = numerical_gradient(lambda z: f(z.reshape(x.shape)), xm.reshape(x.shape), h=h).reshape(-1)
        H[:, j] = (gp - gm) / (2*h)
    return H.reshape(x.shape + x.shape)



def grad_fd(f, x, h=1e-6):
    return numerical_gradient(f, x, h=h)



def fd_grad(f, x, h=1e-6):
    return numerical_gradient(f, np.asarray(x, dtype=float), h=h)

print("Chapter helper setup complete.")

Exercise 1 (★): Critical Point Classification

Let $f(x,y) = 3x^2y - y^3 - 3x^2 + 3$ (a monkey-saddle-related surface).

(a) Find all critical points by solving $\nabla f = \mathbf{0}$.

(b) Compute the Hessian at each critical point and classify using the $\det(H)$ test.

(c) For the minimum (if any), verify: run gradient descent from a nearby point and confirm it converges to the critical point.

(d) Compute the condition number of the Hessian at any strict local minimum found.

Code cell 5

# Your Solution
# Exercise 1 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 1.")

Code cell 6

# Solution
# Exercise 1 - reference solution

def f1(xy):
    x, y = xy
    return 3*x**2*y - y**3 - 3*x**2 + 3

# (a) Critical points
# df/dx = 6x(y-1) = 0: x=0 or y=1
# df/dy = 3(x^2-y^2) = 0: x=y or x=-y
# Case 1: x=0 => from df/dy: y=0. Critical point (0,0)
# Case 2: y=1, x=y=1 => (1,1);  y=1, x=-y=-1 => (-1,1)
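# Analytic derivatives (used only as a cross-check on the finite differences below):
#   grad f = (6x(y-1), 3(x^2 - y^2))
#   Hess f = [[6(y-1), 6x], [6x, -6y]]
# e.g. H(0,0) = diag(-6, 0) (degenerate) and H(1,1) = [[0, 6], [6, -6]] (indefinite).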
critical_pts = [np.array([0., 0.]), np.array([1., 1.]), np.array([-1., 1.])]

def classify_2d(f, xy_arr, tol=1e-4):
    H = hessian_fd(f, xy_arr)
    det_H = la.det(H)
    h11 = H[0, 0]
    # use a small tolerance so finite-difference noise does not misclassify
    # the degenerate point (0,0), where det(H) = 0 exactly
    if det_H > tol and h11 > 0: typ = 'LOCAL MIN'
    elif det_H > tol and h11 < 0: typ = 'LOCAL MAX'
    elif det_H < -tol: typ = 'SADDLE'
    else: typ = 'INCONCLUSIVE'
    eigs = la.eigvalsh(H)
    return typ, H, det_H, eigs

header('Exercise 1: Critical Point Classification')
for pt in critical_pts:
    g = grad_fd(f1, pt)
    typ, H, det_H, eigs = classify_2d(f1, pt)
    print(f'  x*={pt}: ||grad||={la.norm(g):.2e}, det(H)={det_H:.4f}, type={typ}')
    print(f'    eigenvalues={eigs.round(4)}')

# (c) There is no strict local minimum: (1,1) and (-1,1) are saddles and (0,0) is a
# degenerate (monkey-saddle) point, so GD started near (1,1) escapes the saddle and
# keeps decreasing f (which is unbounded below).
w = np.array([1.2, 1.1])
f_start = f1(w)
for _ in range(200):
    g = grad_fd(f1, w)
    if la.norm(g) < 1e-8:
        break
    w -= 0.001 * g

print(f'\n(c) GD from (1.2,1.1) after 200 steps: w={w.round(4)}, f(w)={f1(w):.4f} (started at {f_start:.4f})')
check_true('(c) GD escapes the saddle (f decreases)', f1(w) < f_start)

# (d) No strict local minimum exists, so report the condition number of the
# (indefinite) Hessian at (1,1): ratio of largest to smallest eigenvalue magnitude.
_, H11, _, eigs11 = classify_2d(f1, np.array([1., 1.]))
kappa = np.abs(eigs11).max() / np.abs(eigs11).min()
print(f'\n(d) Condition number of H at (1,1): {kappa:.4f}')

print('\nTakeaway: det(H) test classifies ℝ² critical points; condition number measures convergence difficulty.')

print("Exercise 1 solution complete.")

Exercise 2 (★): Lagrange Multipliers: AM-GM Inequality

Maximise $f(x_1, x_2, x_3) = x_1 x_2 x_3$ subject to $x_1 + x_2 + x_3 = c$, $\mathbf{x} \geq \mathbf{0}$.

(a) Write the Lagrangian and derive the KKT stationarity conditions.

(b) Show analytically that the maximum is $f^* = (c/3)^3$ and occurs at $x_1^* = x_2^* = x_3^* = c/3$.

(c) Verify numerically for $c = 12$: the maximum should be $f^* = 64$.

(d) Interpret the Lagrange multiplier: compute $\lambda^*$ and verify $dp^*/dc = \lambda^*$ via finite differences.

(e) Derive the AM-GM inequality from this: $\frac{x_1+x_2+x_3}{3} \geq (x_1 x_2 x_3)^{1/3}$.

Code cell 8

# Your Solution
# Exercise 2 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 2.")

Code cell 9

# Solution
# Exercise 2 - reference solution

c = 12.0
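
# (a) Lagrangian (maximisation with one equality constraint), with the sign
#     convention L = f + lambda*g used throughout this solution:
#   L(x, lambda) = x1*x2*x3 + lambda*(x1 + x2 + x3 - c)
# Stationarity: dL/dx1 = x2*x3 + lambda = 0 (and cyclic permutations).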

# (b) KKT stationarity: x2*x3 + lambda = 0, x1*x3 + lambda = 0, x1*x2 + lambda = 0
# => x2*x3 = x1*x3 = x1*x2 => all products equal => x1=x2=x3
# Constraint: 3*x1 = c => x1* = c/3
x_star = np.array([c/3, c/3, c/3])
f_star = np.prod(x_star)  # (c/3)^3

# Lambda* from stationarity (convention L = f + lambda*(sum(x) - c)): x2*x3 + lambda = 0
# => lambda* = -(c/3)^2, so the shadow price dp*/dc equals -lambda* = (c/3)^2
lambda_star = -(c/3)**2

header('Exercise 2: Lagrange Multipliers and AM-GM')
print(f'(b) x* = {x_star}, f* = {f_star:.4f}')
check_close('(b) f* = (c/3)^3', f_star, (c/3)**3)

# (c) Numerical verification
def f2_neg(x): return -x[0]*x[1]*x[2]
cons = [{'type':'eq','fun': lambda x: x.sum()-c}]
bounds = [(1e-8, None)]*3
res = minimize(f2_neg, [c/3]*3, constraints=cons, bounds=bounds, tol=1e-12)
print(f'(c) Numerical max: {-res.fun:.6f}  (expected: {f_star:.6f})')
check_close('(c) numerical matches analytic', -res.fun, f_star, tol=1e-4)

# (d) Shadow price: dp*/dc
# p*(c) = (c/3)^3, dp*/dc = 3*(c/3)^2/3 = (c/3)^2
def p_star_c(c_val):
    return (c_val/3)**3

dp_dc_analytic = (c/3)**2  # analytical derivative
dp_dc_fd = (p_star_c(c+1e-5) - p_star_c(c-1e-5)) / (2e-5)
print(f'\n(d) dp*/dc = {dp_dc_analytic:.4f} (analytic)')
print(f'    dp*/dc = {dp_dc_fd:.4f} (FD)')
print(f'    -lambda* = {-lambda_star:.4f}  (shadow price)')
check_close('(d) -lambda* = dp*/dc', -lambda_star, dp_dc_analytic)

# (e) AM-GM: the constrained max says, for any x1, x2, x3 >= 0 with x1+x2+x3 = c,
#     x1*x2*x3 <= (c/3)^3 = ((x1+x2+x3)/3)^3.
# Taking cube roots: (x1*x2*x3)^(1/3) <= (x1+x2+x3)/3, i.e. GM <= AM.
test_vecs = [np.array([1.,2.,3.]), np.array([0.5, 5., 1.5]), np.array([4.,4.,4.])]
print('\n(e) AM-GM verification:')
for v in test_vecs:
    am = v.mean()
    gm = np.prod(v)**(1/3)
    print(f'  x={v}: AM={am:.4f}, GM={gm:.4f}, AM>=GM: {am >= gm - 1e-10}')

print('\nTakeaway: The Lagrange multiplier is the marginal value of relaxing the constraint.')
print('AM-GM is a direct consequence of the constrained max being (c/3)^3.')

print("Exercise 2 solution complete.")

Exercise 3 (★): KKT Conditions for a QP

Solve: $\min_{x_1, x_2} \frac{1}{2}(x_1^2 + x_2^2)$ subject to $x_1 + x_2 \geq 3$, $x_1 \geq 0$, $x_2 \geq 0$.

(a) Rewrite in standard form ($h_j(\mathbf{x}) \leq 0$) and write the full Lagrangian.

(b) Write out all four KKT conditions. Solve the KKT system analytically.

(c) Verify global optimality: check that the problem is convex.

(d) Compute the duality gap (should be zero by strong duality).

(e) Show that the shadow price $\mu_1^*$ gives $dp^*/db_1$, where $b_1 = 3$ is the RHS of the first constraint.

Code cell 11

# Your Solution
# Exercise 3 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 3.")

Code cell 12

# Solution
# Exercise 3 - reference solution

# (b) Analytical KKT solution:
# Guess: h1 active (x1+x2=3), h2 inactive (mu2=0), h3 inactive (mu3=0)
# Stationarity: x1 = mu1, x2 = mu1 => x1 = x2
# Constraint: 2*x1 = 3 => x1* = x2* = 1.5
# mu1* = x1* = 1.5
# Check: mu1 > 0 (dual feas ok), h2=-1.5<0 (primal feas ok)
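# Lagrangian used above (constraints written as h_j <= 0):
#   h1 = 3 - x1 - x2,  h2 = -x1,  h3 = -x2
#   L(x, mu) = 0.5*(x1^2 + x2^2) + mu1*(3 - x1 - x2) - mu2*x1 - mu3*x2
# Stationarity: x1 - mu1 - mu2 = 0  and  x2 - mu1 - mu3 = 0.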

x_star = np.array([1.5, 1.5])
mu1_star = 1.5
mu2_star = 0.0
mu3_star = 0.0

header('Exercise 3: KKT for QP')

# Verify all four KKT conditions
# (1) Stationarity
stat_x1 = x_star[0] - mu1_star - mu2_star
stat_x2 = x_star[1] - mu1_star - mu3_star
check_close('(1) Stationarity x1', stat_x1, 0.0)
check_close('(1) Stationarity x2', stat_x2, 0.0)

# (2) Primal feasibility
h1 = -(x_star[0]+x_star[1]) + 3
h2, h3 = -x_star[0], -x_star[1]
check_true('(2) h1 <= 0', h1 <= 1e-10)
check_true('(2) h2 <= 0', h2 <= 1e-10)
check_true('(2) h3 <= 0', h3 <= 1e-10)

# (3) Dual feasibility
check_true('(3) mu1 >= 0', mu1_star >= 0)
check_true('(3) mu2 >= 0', mu2_star >= 0)
check_true('(3) mu3 >= 0', mu3_star >= 0)

# (4) Complementary slackness
check_close('(4) CS: mu1*h1', mu1_star*h1, 0.0)
check_close('(4) CS: mu2*h2', mu2_star*h2, 0.0)

primal_val = 0.5*(x_star[0]**2 + x_star[1]**2)
print(f'\n(c) Primal f* = {primal_val:.4f}')
print('Problem is convex (quadratic obj, linear constraints) => KKT = global optimum')

# (d) Duality gap
# Lagrangian dual function: q(mu) = inf_{x>=0} [0.5(x1^2+x2^2) + mu1*(3-x1-x2)]
# = 3*mu1 + min_{x1>=0} [0.5*x1^2 - mu1*x1] + min_{x2>=0} [0.5*x2^2 - mu1*x2]
# = 3*mu1 - 0.5*mu1^2 - 0.5*mu1^2 = 3*mu1 - mu1^2
# d* = max_{mu1>=0} (3*mu1 - mu1^2): d/dmu1 = 3 - 2*mu1 = 0 => mu1*=1.5, d*=3*1.5-1.5^2=4.5-2.25=2.25
dual_val = 3*mu1_star - mu1_star**2
gap = abs(primal_val - dual_val)
print(f'(d) Primal val={primal_val:.4f}, Dual val={dual_val:.4f}, Gap={gap:.2e}')
check_close('(d) Strong duality (zero gap)', gap, 0.0, tol=1e-8)

# (e) Shadow price
def p_star_b(b):
    # Same QP with x1+x2>=b: x1*=x2*=b/2, p*=b^2/4
    return (b/2)**2

b0 = 3.0
dp_db_fd = (p_star_b(b0+1e-5) - p_star_b(b0-1e-5)) / (2e-5)
dp_db_analytic = b0/2  # derivative of b^2/4
print(f'\n(e) dp*/db|_{{b=3}} = {dp_db_analytic:.4f}')
print(f'    mu1* = {mu1_star:.4f}  (should match dp*/db)')
check_close('(e) Shadow price = dp*/db', mu1_star, dp_db_analytic)

print('\nTakeaway: KKT gives exact solution for convex QP; shadow price = dp*/db.')

print("Exercise 3 solution complete.")

Exercise 4 (★★): Convexity Analysis

For each function, determine if it is convex, strictly convex, concave, or neither. Verify your answer via second-order conditions.

(a) $f(x) = e^x - x - 1$ on $\mathbb{R}$

(b) $f(\mathbf{x}) = \|\mathbf{x}\|_2^2 + \|\mathbf{x}\|_1$ on $\mathbb{R}^n$

(c) $f(x) = \log(1 + e^x)$ (softplus) on $\mathbb{R}$

(d) $f(x, y) = x^2/y$ for $y > 0$ (perspective of $x^2$)

(e) $f(\mathbf{w}) = \|A\mathbf{w} - \mathbf{b}\|^2 - \lambda\|\mathbf{w}\|^2$ for $\lambda > \lambda_{\max}(A^\top A)$

Code cell 14

# Your Solution
# Exercise 4 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 4.")

Code cell 15

# Solution
# Exercise 4 - reference solution

header('Exercise 4: Convexity Analysis')

# (a) f(x) = exp(x) - x - 1: f''(x) = exp(x) > 0 everywhere
print('(a) f(x) = exp(x) - x - 1')
xs = np.linspace(-3, 3, 10)
f_second_deriv = np.exp(xs)  # f''(x) = exp(x)
print(f'  f\'\' range: [{f_second_deriv.min():.4f}, {f_second_deriv.max():.4f}]  (all > 0)')
check_true('(a) strictly convex (f\'\'> 0)', np.all(f_second_deriv > 0))

# (b) f(x) = ||x||^2 + ||x||_1 (smooth approx for check)
print('\n(b) f(x) = ||x||^2 + ||x||_1')
eps = 0.1  # small smoothing to avoid nondiff at 0
fb = lambda x: la.norm(x)**2 + la.norm(x, 1)
for pt in [np.array([1.,2.,3.]), np.array([-1.,0.5,2.])]:
    H = hessian_fd(fb, pt+eps*np.ones_like(pt))
    eigs = la.eigvalsh(H)
    print(f'  at x={pt}: min_eig={eigs.min():.4f}')
check_true('(b) convex (sum of convex)', True)

# (c) f(x) = log(1+exp(x))  [softplus]
print('\n(c) f(x) = log(1+exp(x))')
fc = lambda x: np.log(1 + np.exp(x[0]))
xs_c = np.linspace(-3, 3, 20)
sec_deriv_c = np.array([hessian_fd(fc, np.array([xi]))[0,0] for xi in xs_c])
print(f'  f\'\' range: [{sec_deriv_c.min():.4f}, {sec_deriv_c.max():.4f}]')
check_true('(c) convex (f\'\' >= 0)', np.all(sec_deriv_c >= -1e-8))
print('  f\'\' = sigmoid(x)*(1-sigmoid(x)) in [0, 0.25]')

# (d) f(x,y) = x^2/y (y>0) — perspective function
print('\n(d) f(x,y) = x^2/y  (perspective of x^2)')
fd2 = lambda v: v[0]**2 / v[1]
pts_d = [np.array([1., 2.]), np.array([0., 1.]), np.array([3., 0.5])]
for pt in pts_d:
    H = hessian_fd(fd2, pt)
    eigs = la.eigvalsh(H)
    print(f'  at ({pt[0]},{pt[1]}): eigs={eigs.round(4)}, min_eig={eigs.min():.4f}')
check_true('(d) convex (perspective of convex is convex)', True)

# (e) f(w) = ||Aw-b||^2 - lambda*||w||^2
print('\n(e) f(w) = ||Aw-b||^2 - lambda*||w||^2 with lambda > lambda_max(A^TA)')
np.random.seed(42)
A_e, b_e = np.random.randn(5,3), np.random.randn(5)
lmax = la.eigvalsh(A_e.T @ A_e).max()
# Hessian = 2*A^T A - 2*lambda*I
# For lambda > lambda_max: 2*(A^T A - lambda*I) has all eigenvalues < 0 => concave!
for lam_factor in [0.5, 1.0, 2.0]:
    lam_test = lam_factor * lmax
    H_e = 2*(A_e.T @ A_e) - 2*lam_test*np.eye(3)
    eigs_e = la.eigvalsh(H_e)
    status = 'CONVEX' if np.all(eigs_e >= -1e-8) else 'CONCAVE' if np.all(eigs_e <= 1e-8) else 'NEITHER'
    print(f'  lambda={lam_factor:.1f}*lambda_max: min_eig={eigs_e.min():.4f}, status={status}')
check_true('(e) concave when lambda > lambda_max', True)

print('\nTakeaway: Convexity is preserved under addition, scaling, and composition rules.')
print('Second-order check (H >= 0) is the most systematic verification method.')

print("Exercise 4 solution complete.")

Exercise 5 (★★): SVM Dual Derivation

Derive the hard-margin SVM dual from scratch and verify strong duality.

(a) Generate linearly separable 2D data (10 positive, 10 negative samples).

(b) Implement the SVM primal and solve with scipy.optimize.minimize.

(c) Form the Lagrangian, derive the dual function $q(\boldsymbol{\alpha})$, and implement the dual.

(d) Solve the dual and verify $p^* = d^*$ (strong duality, zero gap).

(e) Identify support vectors via complementary slackness: $\alpha_i^* > 0 \Leftrightarrow y_i(\mathbf{w}^{*\top}\mathbf{x}_i + b^*) = 1$.

Code cell 17

# Your Solution
# Exercise 5 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 5.")

Code cell 18

# Solution
# Exercise 5 - reference solution

np.random.seed(0)
n_pos5, n_neg5 = 10, 10
X_pos5 = np.random.randn(n_pos5, 2) + np.array([2., 0.])
X_neg5 = np.random.randn(n_neg5, 2) + np.array([-2., 0.])
X5 = np.vstack([X_pos5, X_neg5])
y5 = np.hstack([np.ones(n_pos5), -np.ones(n_neg5)])
n5 = len(y5)

# (b) Primal
def svm_primal_obj(params):
    return 0.5 * la.norm(params[:2])**2

def svm_primal_grad(params):
    g = np.zeros(3); g[:2] = params[:2]; return g

cons5 = [{'type':'ineq','fun': lambda p: y5*(X5 @ p[:2] + p[2]) - 1,
           'jac': lambda p: np.column_stack([np.outer(y5, np.ones(2))*X5, y5])}]
res5p = minimize(svm_primal_obj, np.zeros(3), jac=svm_primal_grad,
                  method='SLSQP', constraints=cons5, tol=1e-10)
w5, b5 = res5p.x[:2], res5p.x[2]
p_star5 = 0.5 * la.norm(w5)**2

# (c) Dual: q(alpha) = sum(alpha) - 0.5 * alpha^T Q alpha
#           Q_ij = y_i y_j x_i^T x_j
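# Derivation sketch: L(w, b, alpha) = 0.5*||w||^2 - sum_i alpha_i*(y_i*(w.x_i + b) - 1).
# Setting dL/dw = 0 gives w = sum_i alpha_i*y_i*x_i and dL/db = 0 gives sum_i alpha_i*y_i = 0;
# substituting back yields q(alpha) = sum(alpha) - 0.5*alpha^T Q alpha (maximise over alpha >= 0).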
Q5 = np.outer(y5, y5) * (X5 @ X5.T)

def dual_obj5(alpha): return 0.5 * alpha @ Q5 @ alpha - alpha.sum()
def dual_grad5(alpha): return Q5 @ alpha - np.ones(n5)

# (d) Solve dual
cons5d = [{'type':'eq','fun': lambda a: a @ y5, 'jac': lambda a: y5}]
bounds5d = [(0, None)] * n5
a0 = np.ones(n5) * 0.01
res5d = minimize(dual_obj5, a0, jac=dual_grad5,
                  method='SLSQP', bounds=bounds5d, constraints=cons5d, tol=1e-12)
alpha5 = res5d.x
d_star5 = -res5d.fun  # dual maximises

# Recover w from dual
w5_dual = (alpha5 * y5) @ X5
sv_mask5 = alpha5 > 1e-3
b5_dual = np.mean(y5[sv_mask5] - X5[sv_mask5] @ w5_dual)

header('Exercise 5: SVM Primal-Dual')
print(f'(b) Primal: w={w5.round(4)}, b={b5:.4f}, p*={p_star5:.4f}')
print(f'(d) Dual:   d*={d_star5:.4f}')
gap5 = abs(p_star5 - d_star5)
check_close('(d) Duality gap = 0 (strong duality)', gap5, 0.0, tol=1e-4)

# (e) Support vectors via complementary slackness
margins5 = y5 * (X5 @ w5_dual + b5_dual)
print(f'\n(e) Support vector indices: {np.where(sv_mask5)[0]}')
print(f'    Margin values at SVs: {margins5[sv_mask5].round(4)}  (should be ~1)')
check_true('(e) SVs are on the margin (margin=1)', np.allclose(margins5[sv_mask5], 1.0, atol=0.1))

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(7, 6))
    ax.scatter(X_pos5[:,0], X_pos5[:,1], c='blue', label='+1', s=40)
    ax.scatter(X_neg5[:,0], X_neg5[:,1], c='red', label='-1', s=40)
    ax.scatter(X5[sv_mask5,0], X5[sv_mask5,1], s=150, facecolors='none',
               edgecolors='k', linewidths=2, label='Support vectors')
    xlim = ax.get_xlim()
    xline = np.array(xlim)
    ax.plot(xline, (-w5_dual[0]*xline - b5_dual)/w5_dual[1], 'k-', label='Decision boundary')
    ax.plot(xline, (-w5_dual[0]*xline - b5_dual + 1)/w5_dual[1], 'k--', alpha=0.5)
    ax.plot(xline, (-w5_dual[0]*xline - b5_dual - 1)/w5_dual[1], 'k--', alpha=0.5)
    ax.set_title('SVM: Hard Margin (KKT + Duality)')
    ax.legend(); plt.tight_layout(); plt.show()

print('\nTakeaway: SVM dual via KKT gives kernel trick;')
print('support vectors identified by complementary slackness alpha_i > 0.')

print("Exercise 5 solution complete.")

Exercise 6 (★★): Maximum Entropy Distribution

Find the maximum entropy distribution on $\{1, 2, 3, 4, 5\}$ subject to $\sum_i p_i = 1$ and $\mathbb{E}[X] = \mu_0 = 3.2$.

(a) Write the Lagrangian with multipliers $\lambda_0$ (normalisation) and $\lambda_1$ (mean constraint).

(b) Derive that $p_i^* = \exp(-\lambda_0 - \lambda_1 i) / Z$ where $Z = \sum_i \exp(-\lambda_0 - \lambda_1 i)$.

(c) Find $\lambda_1^*$ numerically by solving the mean constraint.

(d) Compute $H(p^*)$ and compare it to $H(\text{uniform})$. Show that $p^*$ has the highest entropy among distributions satisfying the mean constraint.

(e) Verify that $p^*$ is the softmax of the logits $z_i = -\lambda_1^* i$: $p^*_i = \text{softmax}(\mathbf{z})_i$.

Code cell 20

# Your Solution
# Exercise 6 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 6.")

Code cell 21

# Solution
# Exercise 6 - reference solution

K6 = 5
items6 = np.arange(1, K6+1).astype(float)
mu0_6 = 3.2

# (b) Max-ent distribution
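# Stationarity of L(p, lambda0, lambda1) = -sum_i p_i*log(p_i) + lambda0*(1 - sum_i p_i)
#                                          + lambda1*(mu0 - sum_i i*p_i):
#   -log(p_i) - 1 - lambda0 - lambda1*i = 0  =>  p_i proportional to exp(-lambda1*i),
# with the normalisation absorbed into Z (handled by the softmax-style code below).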
def max_ent_dist6(lam1):
    logp = -lam1 * items6
    logp -= logp.max()  # numerical stability
    p = np.exp(logp)
    return p / p.sum()

# (c) Solve mean constraint
from scipy.optimize import brentq
def mean_res6(lam1):
    return max_ent_dist6(lam1) @ items6 - mu0_6

lam1_star6 = brentq(mean_res6, -5, 5)
p_star6 = max_ent_dist6(lam1_star6)

def entropy6(p): return -np.sum(p * np.log(p + 1e-15))
p_uniform6 = np.ones(K6) / K6

header('Exercise 6: Maximum Entropy Distribution')
print(f'(c) lambda_1* = {lam1_star6:.6f}')
print(f'    p* = {p_star6.round(6)}')
check_close('(c) Mean constraint satisfied', p_star6 @ items6, mu0_6)
check_close('(c) Normalisation', p_star6.sum(), 1.0)

H_maxent = entropy6(p_star6)
H_uniform = entropy6(p_uniform6)  # note: the uniform distribution has mean 3.0, not 3.2
# Compare max-ent to closest uniform-ish dist with same mean
print(f'\n(d) H(p*) = {H_maxent:.6f}')
print(f'    H(uniform) = {H_uniform:.6f}  (mean=3.0, not {mu0_6})')
# Max-ent has highest entropy among all dist with given mean
# Any other dist with same mean has lower or equal entropy

# Try some alternative distributions with same mean
def entropy_of_dist_with_mean(p):
    if abs(p @ items6 - mu0_6) > 0.01: return -np.inf
    return entropy6(p)

test_dists = [
    np.array([0.10, 0.10, 0.30, 0.50, 0.00]),   # skewed, mean = 3.2
    np.array([0.12, 0.18, 0.25, 0.28, 0.17]),   # flatter, mean = 3.2
]
print('\n  Alternative distributions with mean 3.2:')
for td in test_dists:
    print(f'  mean={td @ items6:.2f}, H={entropy6(td):.4f} (vs H(p*)={H_maxent:.4f})')
check_true('(d) max-ent has highest entropy (among same-mean dists)',
           all(H_maxent >= entropy6(td) - 1e-6 for td in test_dists))

# (e) Softmax connection
z6 = -lam1_star6 * items6
softmax6 = np.exp(z6 - z6.max()) / np.exp(z6 - z6.max()).sum()
check_close('(e) p* = softmax(-lambda1* * items)', p_star6, softmax6)
print(f'\n(e) Logits z_i = -lambda1* * i = {z6.round(4)}')
print(f'    Softmax(z) = {softmax6.round(6)}')

print('\nTakeaway: Max-entropy solution is always a Boltzmann/softmax distribution.')
print('The Lagrange multiplier lambda_1* is the inverse temperature.')

print("Exercise 6 solution complete.")

Exercise 7 (★★★): SAM: KKT Analysis of the Inner Problem

Sharpness-Aware Minimisation (SAM) solves $\min_{\mathbf{w}} \max_{\|\boldsymbol{\epsilon}\| \leq \rho} \mathcal{L}(\mathbf{w} + \boldsymbol{\epsilon})$.

(a) Write the inner maximisation $\max_{\|\boldsymbol{\epsilon}\| \leq \rho} \mathcal{L}(\mathbf{w} + \boldsymbol{\epsilon})$ as a constrained problem in standard form.

(b) Write the KKT conditions and solve analytically for $\boldsymbol{\epsilon}^*$, using the first-order approximation of $\mathcal{L}$ around $\mathbf{w}$. Show $\boldsymbol{\epsilon}^* = \rho \nabla\mathcal{L}(\mathbf{w}) / \|\nabla\mathcal{L}(\mathbf{w})\|$.

(c) Implement the SAM update and verify that $\boldsymbol{\epsilon}^*$ satisfies the KKT conditions.

(d) For $\mathcal{L}(\mathbf{w}) = (w_1 - 1)^2 + 10(w_2 - 1)^2$: compare the minima found by GD and SAM, and check whether SAM finds a flatter minimum.

(e) Measure sharpness: compute $\lambda_{\max}(H(\mathbf{w}^*))$ for both solutions.

Code cell 23

# Your Solution
# Exercise 7 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 7.")

Code cell 24

# Solution
# Exercise 7 - reference solution

from scipy.optimize import minimize as spmin

rho7 = 0.1
eta7 = 0.005

# Sharp quadratic: min = (1,1), Hessian = diag(2, 20)
f7 = lambda w: (w[0]-1)**2 + 10*(w[1]-1)**2
grad_f7 = lambda w: np.array([2*(w[0]-1), 20*(w[1]-1)])
hess_f7 = lambda w: np.diag([2.0, 20.0])

# (b) Analytical eps_star
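# Linearise: L(w + eps) ~ L(w) + g^T eps with g = grad L(w).
# Inner problem: max_eps g^T eps  s.t.  ||eps||^2 - rho^2 <= 0.
# KKT stationarity: g - 2*nu*eps = 0 => eps = g/(2*nu); for g != 0 the constraint
# is active (nu > 0), so ||eps|| = rho and eps* = rho * g / ||g||.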
def eps_star_sam(w, f, rho):
    g = grad_fd(f, w)
    gnorm = la.norm(g)
    if gnorm < 1e-12: return np.zeros_like(w)
    return rho * g / gnorm

# Verify KKT at a test point
w_test = np.array([2.0, 2.0])
eps_kkt = eps_star_sam(w_test, f7, rho7)
g_test = grad_fd(f7, w_test)

header('Exercise 7: SAM KKT Analysis')
print(f'(b) At w={w_test}:')
print(f'    grad f = {g_test.round(4)}')
print(f'    eps*   = {eps_kkt.round(4)}')
print(f'    ||eps*|| = {la.norm(eps_kkt):.4f}  (should be {rho7})')
cos_angle = np.dot(eps_kkt, g_test) / (la.norm(eps_kkt)*la.norm(g_test))
check_close('(b) eps* parallel to grad (cos=1)', cos_angle, 1.0)
check_close('(b) ||eps*|| = rho', la.norm(eps_kkt), rho7)

# (c) Implement SAM
def sam_step7(w, f, rho, eta):
    eps = eps_star_sam(w, f, rho)
    g_perturbed = grad_fd(f, w + eps)
    return w - eta * g_perturbed

# (d) Compare GD and SAM
w0_7 = np.array([3.0, 3.0])
w_gd7 = w0_7.copy(); w_sam7 = w0_7.copy()

for _ in range(1000):
    g = grad_fd(f7, w_gd7)
    w_gd7 = w_gd7 - eta7 * g

for _ in range(1000):
    w_sam7 = sam_step7(w_sam7, f7, rho7, eta7)

print(f'\n(d) GD  final: w={w_gd7.round(4)}, f={f7(w_gd7):.2e}')
print(f'    SAM final: w={w_sam7.round(4)}, f={f7(w_sam7):.2e}')

# (e) Sharpness: lambda_max(H)
H_gd  = hessian_fd(f7, w_gd7)
H_sam = hessian_fd(f7, w_sam7)
lmax_gd  = la.eigvalsh(H_gd).max()
lmax_sam = la.eigvalsh(H_sam).max()

print(f'\n(e) Sharpness lambda_max(H):')
print(f'    GD:  {lmax_gd:.4f}')
print(f'    SAM: {lmax_sam:.4f}')
print(f'    Note: for a quadratic, both converge to (1,1);')
print(f'    SAM effect is more pronounced for non-quadratic losses')

check_true('(d) GD converges to minimum', la.norm(grad_fd(f7,w_gd7)) < 0.01)
check_true('(d) SAM converges to minimum', la.norm(grad_fd(f7,w_sam7)) < 0.05)

print('\nTakeaway: SAM eps* = rho * grad/||grad|| solves the KKT of the inner max.')
print('SAM prefers flatter regions by ascending to worst-case neighbour before descending.')

print("Exercise 7 solution complete.")

Exercise 8 (★★★): RLHF Optimal Policy and DPO

Derive and analyse the RLHF optimal policy via KKT conditions.

(a) Formulate RLHF as $\max_\pi \mathbb{E}[r(\mathbf{y})]$ subject to $D_{\text{KL}}(\pi \| \pi_{\text{ref}}) \leq \delta$.

(b) Write the Lagrangian with multiplier $\beta$. Derive the KKT stationarity condition and show the optimal policy is $\pi^*(y) \propto \pi_{\text{ref}}(y) \exp(r(y)/\beta)$.

(c) For a discrete outcome space with $K=6$ responses, implement the optimal policy as a function of $\beta$. Show how $\beta$ interpolates between pure reward maximisation ($\beta \to 0$) and the reference policy ($\beta \to \infty$).

(d) DPO reparameterisation: given a trained model $\pi_\theta$, recover the implicit reward $\hat{r}(y) = \beta \log(\pi_\theta(y)/\pi_{\text{ref}}(y))$. Verify it ranks responses correctly.

(e) Compute the KL divergence $D_{\text{KL}}(\pi^* \| \pi_{\text{ref}})$ as a function of $\beta$ and verify the envelope relation $dE[r]/d\delta = \beta^*$ (the multiplier is the shadow price of the KL budget).

Code cell 26

# Your Solution
# Exercise 8 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 8.")

Code cell 27

# Solution
# Exercise 8 - reference solution

K8 = 6
np.random.seed(42)

pi_ref8 = np.array([0.05, 0.20, 0.30, 0.25, 0.15, 0.05])
pi_ref8 /= pi_ref8.sum()
r8 = np.array([0.2, 0.5, 0.8, 1.5, 2.0, 0.3])

def kl_div8(p, q):
    return np.sum(p * (np.log(p + 1e-15) - np.log(q + 1e-15)))

# (b) KKT: stationarity gives pi*(y) ∝ pi_ref(y) * exp(r(y)/beta)
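# Derivation sketch: L(pi, beta, lambda) = E_pi[r] - beta*(KL(pi||pi_ref) - delta) + lambda*(1 - sum_y pi(y)).
# d/dpi(y): r(y) - beta*(log pi(y) - log pi_ref(y) + 1) - lambda = 0
#   => log pi(y) = log pi_ref(y) + r(y)/beta + const  =>  pi*(y) ∝ pi_ref(y)*exp(r(y)/beta).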
def rlhf_policy8(pi_ref, r, beta):
    log_pi = np.log(pi_ref + 1e-15) + r / beta
    log_pi -= log_pi.max()  # stability
    pi = np.exp(log_pi)
    return pi / pi.sum()

header('Exercise 8: RLHF Optimal Policy and DPO')

# (c) Interpolation with beta
betas8 = [0.01, 0.1, 0.5, 1.0, 5.0, 100.0]
print('(c) Policy interpolation with beta:')
for beta in betas8:
    pi_s = rlhf_policy8(pi_ref8, r8, beta)
    er = pi_s @ r8
    kl = kl_div8(pi_s, pi_ref8)
    best = pi_s.argmax()
    print(f'  beta={beta:.2f}: E[r]={er:.3f}, KL={kl:.3f}, argmax={best}')

check_true('(c) small beta -> high reward (argmax=4)', rlhf_policy8(pi_ref8, r8, 0.01).argmax() == 4)
check_true('(c) large beta -> close to ref', kl_div8(rlhf_policy8(pi_ref8, r8, 100), pi_ref8) < 0.01)

# (d) DPO: implicit reward recovery
beta_dpo8 = 1.0
pi_trained8 = rlhf_policy8(pi_ref8, r8, beta_dpo8)

# DPO implicit reward
r_implicit = beta_dpo8 * (np.log(pi_trained8 + 1e-15) - np.log(pi_ref8 + 1e-15))
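# Note: r_implicit = r - beta*log(Z) because pi_trained = pi_ref*exp(r/beta)/Z,
# so the implicit reward recovers r only up to an additive constant -- compare rankings.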

print(f'\n(d) True reward:     {r8}')
print(f'    Implicit reward: {r_implicit.round(4)}')

# Verify ranking
rank_true = np.argsort(r8)[::-1]
rank_impl = np.argsort(r_implicit)[::-1]
check_true('(d) Rankings match', np.array_equal(rank_true, rank_impl))
print(f'    True rank: {rank_true}')
print(f'    Impl rank: {rank_impl}')

# (e) KL vs beta; envelope d E[r]/d delta = 1/beta*
beta_grid = np.logspace(-1, 2, 50)
er_grid = [rlhf_policy8(pi_ref8, r8, b) @ r8 for b in beta_grid]
kl_grid = [kl_div8(rlhf_policy8(pi_ref8, r8, b), pi_ref8) for b in beta_grid]

print('\n(e) KL constraint vs reward trade-off:')
for i, (beta, er, kl) in enumerate(zip(beta_grid, er_grid, kl_grid)):
    if i % 10 == 0:
        print(f'  beta={beta:.2f}: E[r]={er:.4f}, KL={kl:.4f}')

# Envelope check: along the frontier parameterised by beta, the slope of E[r]
# with respect to the KL budget delta equals the multiplier beta*.
beta_check = 1.0
h_b = 1e-4
er_p = rlhf_policy8(pi_ref8, r8, beta_check + h_b) @ r8
er_m = rlhf_policy8(pi_ref8, r8, beta_check - h_b) @ r8
kl_p = kl_div8(rlhf_policy8(pi_ref8, r8, beta_check + h_b), pi_ref8)
kl_m = kl_div8(rlhf_policy8(pi_ref8, r8, beta_check - h_b), pi_ref8)
dE_dKL = (er_p - er_m) / (kl_p - kl_m)
kl0 = kl_div8(rlhf_policy8(pi_ref8, r8, beta_check), pi_ref8)
er0 = rlhf_policy8(pi_ref8, r8, beta_check) @ r8
print(f'\nAt beta={beta_check}: KL={kl0:.4f}, E[r]={er0:.4f}')
print(f'    dE[r]/d(KL) (finite difference) = {dE_dKL:.4f}  vs  beta* = {beta_check:.4f}')
check_close('(e) envelope: dE[r]/d(delta) = beta*', dE_dKL, beta_check, tol=1e-3)
print('Shadow price interpretation: beta* = marginal reward gained per unit of KL budget.')

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    axes[0].semilogx(beta_grid, er_grid, 'b-', linewidth=2)
    axes[0].set_xlabel('beta'); axes[0].set_ylabel('E[r]')
    axes[0].set_title('Expected Reward vs Beta')
    axes[0].grid(True, alpha=0.4)

    axes[1].plot(kl_grid, er_grid, 'r-', linewidth=2)
    axes[1].set_xlabel('KL divergence'); axes[1].set_ylabel('E[r]')
    axes[1].set_title('Reward-KL Trade-off Frontier')
    axes[1].grid(True, alpha=0.4)
    plt.suptitle('RLHF: Lagrange multiplier beta controls reward-KL trade-off', fontsize=12)
    plt.tight_layout(); plt.show()

print('\nTakeaway: RLHF optimal policy is KKT Boltzmann distribution.')
print('Beta is the Lagrange multiplier for the KL constraint (shadow price dE[r]/d(delta) = beta).')
print('DPO recovers reward ranking from log-ratio of trained to reference policy.')

print("Exercise 8 solution complete.")

Exercise 9 (★★★): KKT Conditions for Projection onto the Simplex

Project $u \in \mathbb{R}^n$ onto the probability simplex

$$\Delta = \{x : x_i \ge 0,\ \sum_i x_i = 1\}.$$

Implement the sorting-based projection and verify primal feasibility plus KKT complementarity.

Code cell 29

# Your Solution
# Exercise 9 - learner workspace
# Implement simplex projection and verify KKT conditions.
print("Learner workspace ready for Exercise 9.")

Code cell 30

# Solution
# Exercise 9 - simplex projection KKT
header("Exercise 9: simplex projection")

u = np.array([0.4, -0.2, 1.7, 0.1, 0.9])
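
# KKT structure of min 0.5*||x - u||^2 s.t. sum(x) = 1, x >= 0:
#   stationarity: x_i - u_i + theta - mu_i = 0, with mu_i >= 0 and mu_i*x_i = 0,
#   so x_i = max(u_i - theta, 0), and theta (the equality multiplier) is chosen by
#   the sort-and-threshold routine below so that sum(x) = 1.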

def project_simplex(u):
    v = np.sort(u)[::-1]
    cssv = np.cumsum(v)
    rho = np.nonzero(v * np.arange(1, len(u)+1) > (cssv - 1))[0][-1]
    theta = (cssv[rho] - 1) / (rho + 1)
    return np.maximum(u - theta, 0), theta

x, theta = project_simplex(u)
stationarity_residual = x - u + theta
active = x > 1e-10
print("projected x:", x)
print("sum x:", x.sum(), "theta:", theta)
check_close("simplex sum", x.sum(), 1.0, tol=1e-10)
check_true("nonnegative", np.all(x >= -1e-12))
check_close("stationarity on active set", stationarity_residual[active], 0.0, tol=1e-10)
check_true("inactive coordinates satisfy u_i - theta <= 0", np.all(u[~active] - theta <= 1e-10))
print("Takeaway: many probability constraints reduce to KKT plus a threshold.")

Exercise 10 (★★★): Entropy-Regularized Linear Objective

Solve

$$\max_{p \in \Delta}\ r^\top p + \tau H(p), \qquad H(p) = -\sum_i p_i \log p_i.$$

Derive the softmax solution and verify that it satisfies the simplex constraint and improves the objective.

Code cell 32

# Your Solution
# Exercise 10 - learner workspace
# Derive the entropy-regularized optimum and compare objectives.
print("Learner workspace ready for Exercise 10.")

Code cell 33

# Solution
# Exercise 10 - entropy-regularized optimum
header("Exercise 10: entropy-regularized simplex optimum")

r = np.array([0.2, 1.0, -0.4, 0.7])
tau = 0.35

def entropy(p):
    return -np.sum(p * np.log(p + 1e-12))

def objective(p):
    return r @ p + tau * entropy(p)
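
# KKT derivation: L(p, lambda) = r^T p - tau*sum_i p_i*log(p_i) + lambda*(1 - sum_i p_i).
# d/dp_i: r_i - tau*(log p_i + 1) - lambda = 0  =>  p_i proportional to exp(r_i/tau),
# i.e. p* = softmax(r/tau) (the nonnegativity constraints are inactive).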

p_star = softmax(r / tau)
uniform = np.ones_like(r) / len(r)
print("p_star:", p_star)
print("objective(p_star)=", objective(p_star))
print("objective(uniform)=", objective(uniform))
check_close("simplex sum", p_star.sum(), 1.0, tol=1e-12)
check_true("positive probabilities", np.all(p_star > 0))
check_true("softmax optimum beats uniform", objective(p_star) > objective(uniform))
print("Takeaway: softmax is the KKT solution for entropy-regularized choice.")

What to Review After Finishing

  • Exercise 1: Can you classify critical points in $\mathbb{R}^n$ for $n > 2$?
  • Exercise 2: Do you understand the shadow price as $dp^*/db$?
  • Exercise 3: Can you write all four KKT conditions from memory?
  • Exercise 4: Can you apply all the convexity preservation rules?
  • Exercise 5: Can you derive the SVM dual from the Lagrangian?
  • Exercise 6: Do you understand softmax as the max-entropy solution?
  • Exercise 7: Can you derive the SAM update from KKT first principles?
  • Exercise 8: Do you understand why beta is the Lagrange multiplier in RLHF?
  • Exercise 9: Can you read off the KKT multiplier (the threshold) in the simplex projection?
  • Exercise 10: Can you derive softmax as the entropy-regularised optimum over the simplex?

References

  1. Boyd & Vandenberghe, Convex Optimization (2004) — Chapters 4-5, KKT
  2. Foret et al., Sharpness-Aware Minimization (2021) — SAM derivation
  3. Rafailov et al., Direct Preference Optimization (2023) — DPO from KKT
  4. Nocedal & Wright, Numerical Optimization (2006) — Constrained theory