Exercises Notebook - Math for LLMs

Partial Derivatives and Gradients

Multivariate Calculus / Partial Derivatives and Gradients

Converted from exercises.ipynb for web reading.

Partial Derivatives and Gradients - Exercises

10 graded exercises covering the full section arc, from multivariate calculus mechanics to ML-facing gradient systems.

Each exercise follows the same format:

  • Problem: markdown cell with the task description
  • Your Solution: code cell for learner work
  • Solution: reference solution with checks

Difficulty ranges from straightforward (★) through moderate (★★) to challenging (★★★).

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

import numpy as np
import numpy.linalg as la
from scipy import optimize, special, stats
from scipy.optimize import minimize, fsolve, linprog
from math import factorial
import math
import matplotlib.patches as patches

COLORS = {
    "primary": "#0077BB",
    "secondary": "#EE7733",
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",
    "highlight": "#EE3377",
}
HAS_MPL = True
np.set_printoptions(precision=8, suppress=True)
np.random.seed(42)
spmin = minimize

try:
    import torch
    HAS_TORCH = True
except ImportError:
    torch = None
    HAS_TORCH = False


def header(title):
    print("\n" + "=" * len(title))
    print(title)
    print("=" * len(title))

def check_true(name, cond):
    ok = bool(cond)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    return ok

def check_close(name, got, expected, tol=1e-8):
    ok = np.allclose(got, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}: got {got}, expected {expected}")
    return ok

def check(name, got, expected, tol=1e-8):
    return check_close(name, got, expected, tol=tol)

def sigmoid(x):
    x = np.asarray(x, dtype=float)
    return np.where(x >= 0, 1/(1+np.exp(-x)), np.exp(x)/(1+np.exp(x)))

def softmax(z, axis=-1):
    z = np.asarray(z, dtype=float)
    z = z - np.max(z, axis=axis, keepdims=True)
    e = np.exp(z)
    return e / np.sum(e, axis=axis, keepdims=True)

def relu(x):
    return np.maximum(0, x)

def relu_prime(x):
    return np.where(np.asarray(x) > 0, 1.0, 0.0)

def centered_diff(f, x, h=1e-6):
    return (f(x + h) - f(x - h)) / (2 * h)

def numerical_gradient(f, x, h=1e-6):
    x = np.asarray(x, dtype=float)
    grad = np.zeros_like(x, dtype=float)
    for idx in np.ndindex(x.shape):
        xp = x.copy(); xm = x.copy()
        xp[idx] += h; xm[idx] -= h
        grad[idx] = (f(xp) - f(xm)) / (2*h)
    return grad

def numerical_jacobian(f, x, h=1e-6):
    x = np.asarray(x, dtype=float)
    y0 = np.asarray(f(x), dtype=float)
    J = np.zeros((y0.size, x.size), dtype=float)
    flat = x.reshape(-1)
    for j in range(x.size):
        xp = flat.copy(); xm = flat.copy()
        xp[j] += h; xm[j] -= h
        J[:, j] = ((np.asarray(f(xp.reshape(x.shape))) - np.asarray(f(xm.reshape(x.shape)))) / (2*h)).reshape(-1)
    return J.reshape(y0.shape + x.shape)

def grad_check(f, x, analytic_grad, h=1e-6):
    numeric_grad = numerical_gradient(f, x, h=h)
    denom = la.norm(analytic_grad) + la.norm(numeric_grad) + 1e-12
    return la.norm(analytic_grad - numeric_grad) / denom



def jacobian_fd(f, x, h=1e-6):
    x = np.asarray(x, dtype=float)
    y0 = np.asarray(f(x), dtype=float)
    J = np.zeros((y0.size, x.size), dtype=float)
    flat = x.reshape(-1)
    for j in range(x.size):
        xp = flat.copy(); xm = flat.copy()
        xp[j] += h; xm[j] -= h
        yp = np.asarray(f(xp.reshape(x.shape)), dtype=float).reshape(-1)
        ym = np.asarray(f(xm.reshape(x.shape)), dtype=float).reshape(-1)
        J[:, j] = (yp - ym) / (2*h)
    return J.reshape(y0.shape + x.shape)

def hessian_fd(f, x, h=1e-5):
    x = np.asarray(x, dtype=float)
    H = np.zeros((x.size, x.size), dtype=float)
    flat = x.reshape(-1)
    for j in range(x.size):
        xp = flat.copy(); xm = flat.copy()
        xp[j] += h; xm[j] -= h
        gp = numerical_gradient(lambda z: f(z.reshape(x.shape)), xp.reshape(x.shape), h=h).reshape(-1)
        gm = numerical_gradient(lambda z: f(z.reshape(x.shape)), xm.reshape(x.shape), h=h).reshape(-1)
        H[:, j] = (gp - gm) / (2*h)
    return H.reshape(x.shape + x.shape)



def grad_fd(f, x, h=1e-6):
    return numerical_gradient(f, x, h=h)



def fd_grad(f, x, h=1e-6):
    return numerical_gradient(f, np.asarray(x, dtype=float), h=h)

print("Chapter helper setup complete.")

Exercise 1 (★): Partial Derivatives: Computation and Interpretation

Compute all first-order partial derivatives of the following functions and evaluate them at the given points. Verify numerically using centered finite differences.

(a) $f(x, y) = x^3 y + e^{xy}$. Compute $\partial f/\partial x$ and $\partial f/\partial y$ analytically, then evaluate at $(x, y) = (1, 2)$.

(b) $g(x, y, z) = \sin(xy) + yz^2 - \ln(x+z)$. Compute all three partials at $(x, y, z) = (1, 0, 1)$.

(c) For $h(\mathbf{w}) = \|\mathbf{w}\|^2 = \sum_i w_i^2$, compute $\partial h/\partial w_j$ analytically. Verify: $\nabla h(\mathbf{w}) = 2\mathbf{w}$.

(d) Compute the second-order mixed partial $\partial^2 f/\partial x\,\partial y$ for $f$ from part (a) and verify Clairaut's theorem: $\partial^2 f/\partial x\,\partial y = \partial^2 f/\partial y\,\partial x$.

Code cell 5

# Your Solution
# Exercise 1 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 1.")

Code cell 6

# Solution
# Exercise 1 - reference solution

# (a) f(x, y) = x^3*y + exp(xy)
# df/dx = 3x^2*y + y*exp(xy)
# df/dy = x^3 + x*exp(xy)
def df_dx(x, y): return 3*x**2*y + y*np.exp(x*y)
def df_dy(x, y): return x**3 + x*np.exp(x*y)

f = lambda xy: xy[0]**3*xy[1] + np.exp(xy[0]*xy[1])
x0, y0 = 1.0, 2.0
num_dx = (f(np.array([x0+1e-5, y0])) - f(np.array([x0-1e-5, y0]))) / 2e-5
num_dy = (f(np.array([x0, y0+1e-5])) - f(np.array([x0, y0-1e-5]))) / 2e-5

header('Exercise 1: Partial Derivatives')
check_close('df/dx at (1,2)', df_dx(x0, y0), num_dx)
check_close('df/dy at (1,2)', df_dy(x0, y0), num_dy)

# (b) g(x,y,z) = sin(xy) + y*z^2 - ln(x+z)
# dg/dx = y*cos(xy) - 1/(x+z)
# dg/dy = x*cos(xy) + z^2
# dg/dz = 2yz - 1/(x+z)
def dg_dx(x,y,z): return y*np.cos(x*y) - 1/(x+z)
def dg_dy(x,y,z): return x*np.cos(x*y) + z**2
def dg_dz(x,y,z): return 2*y*z - 1/(x+z)

g = lambda xyz: np.sin(xyz[0]*xyz[1]) + xyz[1]*xyz[2]**2 - np.log(xyz[0]+xyz[2])
p = np.array([1., 0., 1.])
ng = numerical_gradient(g, p)
check_close('dg/dx at (1,0,1)', dg_dx(*p), ng[0])
check_close('dg/dy at (1,0,1)', dg_dy(*p), ng[1])
check_close('dg/dz at (1,0,1)', dg_dz(*p), ng[2])

# (c) grad ||w||^2 = 2w
def grad_norm_sq(w): return 2 * w
w = np.array([1., 2., 3.])
h_fn = lambda w: np.dot(w, w)
check_close('grad ||w||^2', grad_norm_sq(w), numerical_gradient(h_fn, w))

# (d) Clairaut's theorem: d^2f/dxdy = d^2f/dydx
# d^2f/dxdy = 3x^2 + exp(xy) + xy*exp(xy)
h = 1e-4
dfdxy = (f(np.array([x0+h, y0+h])) - f(np.array([x0+h, y0-h]))
         - f(np.array([x0-h, y0+h])) + f(np.array([x0-h, y0-h]))) / (4*h**2)
exact = 3*x0**2 + np.exp(x0*y0) + x0*y0*np.exp(x0*y0)
check_close('Clairaut d2f/dxdy = d2f/dydx', dfdxy, exact, tol=1e-4)
print('\nTakeaway: Centered finite differences achieve O(h^2) accuracy; '
      'Clairaut\'s theorem holds when partials are continuous.')

print("Exercise 1 solution complete.")

Exercise 2 (★): Gradient Geometry

Let $f(x, y) = x^2 + xy + y^2$.

(a) Compute $\nabla f(x, y)$ analytically.

(b) At the point $\mathbf{p} = (1, 1)$, find the gradient $\nabla f(\mathbf{p})$ and its magnitude $\|\nabla f(\mathbf{p})\|$.

(c) Show numerically that the gradient is perpendicular to the level curve $f^{-1}(3)$ at $\mathbf{p}$. (Hint: parametrize the level curve near $\mathbf{p}$ and compute the dot product with the tangent direction.)

(d) Find the unit vector $\hat{\mathbf{u}}$ that minimises $D_{\hat{\mathbf{u}}} f(\mathbf{p})$ (the steepest descent direction) and verify that $D_{\hat{\mathbf{u}}} f = -\|\nabla f\|$.

Code cell 8

# Your Solution
# Exercise 2 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 2.")

Code cell 9

# Solution
# Exercise 2 - reference solution

f2 = lambda xy: xy[0]**2 + xy[0]*xy[1] + xy[1]**2

def grad_f(x, y):
    return np.array([2*x + y, x + 2*y])

def steepest_descent_dir(x, y):
    g = grad_f(x, y)
    return -g / la.norm(g)

p = np.array([1.0, 1.0])
g = grad_f(*p)

header('Exercise 2: Gradient Geometry')
check_close('gradient at (1,1)', g, numerical_gradient(f2, p))
# ||grad f(1,1)|| = ||(3, 3)|| = 3*sqrt(2)
check_close('gradient magnitude', la.norm(g), 3*np.sqrt(2))

# (c) Level curve f=3 near p=(1,1): parametrise as (1+dt, y(dt)) where y satisfies f=3
# Approximate tangent: t = (-df/dy, df/dx) / norm (perpendicular to grad)
tangent = np.array([-g[1], g[0]])  # rotate grad by 90 degrees
tangent /= la.norm(tangent)
dot = np.dot(g, tangent)
check_close('grad ⊥ level curve tangent (dot=0)', dot, 0.0, tol=1e-10)

# (d) Steepest descent
u = steepest_descent_dir(*p)
D_u = np.dot(g, u)
check_close('directional deriv in steepest descent = -||grad||', D_u, -la.norm(g))
check_close('u is unit vector', la.norm(u), 1.0)
print('\nTakeaway: The gradient points perpendicular to level curves; '
      'steepest descent is exactly -∇f/‖∇f‖.')

print("Exercise 2 solution complete.")

Exercise 3 (★): Directional Derivatives

Let $f(x, y) = e^{x^2 + y^2}$ (a radially symmetric exponential, growing away from the origin).

(a) Compute $D_{\hat{\mathbf{u}}} f(\mathbf{p})$ at $\mathbf{p} = (1, 0)$ for $\hat{\mathbf{u}} = (\cos\theta, \sin\theta)$ as a function of $\theta$. Plot it for $\theta \in [0, 2\pi]$.

(b) Find the value of $\theta$ that maximises $D_{\hat{\mathbf{u}}} f(\mathbf{p})$ and the value that minimises it.

(c) Find all directions $\hat{\mathbf{u}}$ such that $D_{\hat{\mathbf{u}}} f(\mathbf{p}) = 0$. These are the neutral directions (tangent to the level set).

(d) Verify numerically that the directional derivative formula $D_{\hat{\mathbf{u}}} f = \nabla f \cdot \hat{\mathbf{u}}$ matches the limit definition $(f(\mathbf{p} + h\hat{\mathbf{u}}) - f(\mathbf{p}))/h$ as $h \to 0$.

Code cell 11

# Your Solution
# Exercise 3 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 3.")

Code cell 12

# Solution
# Exercise 3 - reference solution

f3 = lambda xy: np.exp(xy[0]**2 + xy[1]**2)
p3 = np.array([1.0, 0.0])

# Gradient of f at p: [2x*exp(x^2+y^2), 2y*exp(x^2+y^2)]
def grad_f3(p):
    val = np.exp(p[0]**2 + p[1]**2)
    return np.array([2*p[0]*val, 2*p[1]*val])

def directional_deriv(p, theta):
    u = np.array([np.cos(theta), np.sin(theta)])
    return np.dot(grad_f3(p), u)

g3 = grad_f3(p3)
thetas = np.linspace(0, 2*np.pi, 1000)
Dvs = np.array([directional_deriv(p3, t) for t in thetas])

header('Exercise 3: Directional Derivatives')

# (a)-(b) max/min
theta_max = thetas[np.argmax(Dvs)]
theta_min = thetas[np.argmin(Dvs)]
check_close('max D_u f = ||grad||', Dvs.max(), la.norm(g3), tol=1e-3)
check_close('min D_u f = -||grad||', Dvs.min(), -la.norm(g3), tol=1e-3)
check_close('theta_max ≈ 0 (gradient direction)', theta_max, 0.0, tol=0.01)

# (c) Neutral directions: theta where D_u f = 0
neutral_thetas = [np.pi/2, 3*np.pi/2]  # perpendicular to gradient
for t in neutral_thetas:
    check_close(f'D_u f=0 at theta={t:.3f}', directional_deriv(p3, t), 0.0, tol=1e-10)

# (d) Limit definition vs formula
theta_test = 0.7
u = np.array([np.cos(theta_test), np.sin(theta_test)])
for h in [1e-2, 1e-4, 1e-6]:
    limit_approx = (f3(p3 + h*u) - f3(p3)) / h
    formula = np.dot(g3, u)
    err = abs(limit_approx - formula)
    print(f'h={h:.0e}: limit={limit_approx:.6f}, formula={formula:.6f}, error={err:.2e}')

print('\nTakeaway: D_u f = ∇f·u maximised (=‖∇f‖) along ∇f; '
      'zero in the two perpendicular directions tangent to the level set.')

print("Exercise 3 solution complete.")

Exercise 4 (★★): Matrix Calculus from First Principles

Prove the following gradient identities and verify numerically at a random point.

(a) For $f(\mathbf{x}) = \mathbf{a}^\top \mathbf{x}$ (linear function), prove $\nabla f = \mathbf{a}$.

(b) For $f(\mathbf{x}) = \mathbf{x}^\top A \mathbf{x}$ (quadratic form) with symmetric $A$, prove $\nabla f = 2A\mathbf{x}$.

(c) For $f(\mathbf{x}) = \|A\mathbf{x} - \mathbf{b}\|^2$ (least squares objective), prove $\nabla f = 2A^\top(A\mathbf{x} - \mathbf{b})$.

(d) For $f(X) = \operatorname{tr}(AX)$ (trace function, $X \in \mathbb{R}^{n\times n}$), show that $\partial f/\partial X_{ij} = A_{ji}$, so $\nabla_X f = A^\top$.

Code cell 14

# Your Solution
# Exercise 4 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 4.")

Code cell 15

# Solution
# Exercise 4 - reference solution

import numpy as np
np.random.seed(42)

def numerical_gradient(f, x, h=1e-5):
    grad = np.zeros_like(x, dtype=float)
    for i in range(len(x)):
        ei = np.zeros_like(x, dtype=float); ei[i] = 1.0
        grad[i] = (f(x + h*ei) - f(x - h*ei)) / (2*h)
    return grad

def check_close(name, got, expected, tol=1e-5):
    ok = np.allclose(got, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'}{name}")
    if not ok: print('  exp:', expected, '\n  got:', got)

def grad_linear(a, x): return a
def grad_quadratic(A, x): return 2 * A @ x
def grad_least_squares(A, b, x): return 2 * A.T @ (A @ x - b)

n = 4
a = np.random.randn(n)
A = np.random.randn(n, n); A = A + A.T
b = np.random.randn(n)
x = np.random.randn(n)

print('=' * 50)
print('Exercise 4: Matrix Calculus')
print('=' * 50)

# (a) grad(a^T x) = a
f_lin = lambda x: np.dot(a, x)
check_close('grad(a^Tx) = a', grad_linear(a, x), numerical_gradient(f_lin, x))

# (b) grad(x^T A x) = 2Ax
f_quad = lambda x: x @ A @ x
check_close('grad(x^TAx) = 2Ax', grad_quadratic(A, x), numerical_gradient(f_quad, x))

# (c) grad(||Ax-b||^2) = 2A^T(Ax-b)
f_ls = lambda x: np.sum((A @ x - b)**2)
check_close('grad(||Ax-b||^2)', grad_least_squares(A, b, x), numerical_gradient(f_ls, x))

# (d) grad_X tr(AX) = A^T
n2 = 3
A2 = np.random.randn(n2, n2)
X2 = np.random.randn(n2, n2)
f_tr = lambda X_flat: np.trace(A2 @ X_flat.reshape(n2, n2))
num_grad = numerical_gradient(f_tr, X2.flatten()).reshape(n2, n2)
check_close('grad_X tr(AX) = A^T', num_grad, A2.T)
print('\nTakeaway: All ML loss gradients reduce to these three matrix calculus identities; '
      'proving them rigorously via index notation builds the foundation for backprop.')

print("Exercise 4 solution complete.")

Exercise 5 (★★): MSE Gradient and Linear Regression

The mean squared error for linear regression is:

$$\mathcal{L}(\mathbf{w}) = \frac{1}{m}\|X\mathbf{w} - \mathbf{y}\|^2 = \frac{1}{m}\sum_{i=1}^m (\mathbf{x}_i^\top \mathbf{w} - y_i)^2$$

(a) Derive $\nabla_\mathbf{w} \mathcal{L} = \frac{2}{m} X^\top(X\mathbf{w} - \mathbf{y})$.

(b) Implement gradient descent and run it on a synthetic dataset with $m = 200$ samples, $n = 5$ features, true weights $\mathbf{w}^* = [1, -2, 0.5, 3, -1]^\top$, and noise $\sigma = 0.1$.

(c) Verify that the GD solution converges to the OLS solution $\mathbf{w}_{\text{OLS}} = (X^\top X)^{-1} X^\top \mathbf{y}$.

(d) Plot the training loss curve. How many steps does it take to reach within $10^{-4}$ of $\mathcal{L}^*$?

Code cell 17

# Your Solution
# Exercise 5 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 5.")

Code cell 18

# Solution
# Exercise 5 - reference solution

import numpy as np
np.random.seed(42)

def mse_grad(X, y, w):
    m = len(y)
    return (2 / m) * X.T @ (X @ w - y)

def gradient_descent_lr(X, y, w0, lr, n_steps):
    m = len(y)
    w = w0.copy()
    losses = []
    for _ in range(n_steps):
        loss = np.mean((X @ w - y)**2)
        losses.append(loss)
        w -= lr * mse_grad(X, y, w)
    losses.append(np.mean((X @ w - y)**2))
    return w, losses

m, n = 200, 5
w_true = np.array([1., -2., 0.5, 3., -1.])
X = np.random.randn(m, n)
y = X @ w_true + 0.1 * np.random.randn(m)

w0 = np.zeros(n)
w_gd, losses = gradient_descent_lr(X, y, w0, lr=0.1, n_steps=300)

# OLS solution
w_ols = np.linalg.solve(X.T @ X, X.T @ y)
loss_star = np.mean((X @ w_ols - y)**2)

print('=' * 50)
print('Exercise 5: MSE Gradient and Linear Regression')
print('=' * 50)

def check_close(name, got, expected, tol=1e-3):
    ok = np.allclose(got, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'}{name}")

check_close('GD converges to OLS solution', w_gd, w_ols, tol=1e-3)
check_close('GD converges to true weights', w_gd, w_true, tol=0.05)

# Step to 1e-4 above optimal
losses_arr = np.array(losses)
# First step at which the loss is within 1e-4 of the OLS optimum
steps_needed = int(np.argmax(losses_arr <= loss_star + 1e-4))
print(f'Steps to reach L* + 1e-4: {steps_needed}')
print(f'Final loss: {losses[-1]:.6f}, OLS loss: {loss_star:.6f}')

try:
    import matplotlib.pyplot as plt
    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.semilogy(losses_arr - loss_star + 1e-10, color='#2166AC', lw=2)
    ax.axhline(1e-4, color='#D6604D', ls='--', lw=1.5, label='Target: L*+1e-4')
    ax.set_xlabel('Step'); ax.set_ylabel('L(w) - L* (log)')
    ax.set_title('MSE gradient descent convergence')
    ax.legend()
    plt.tight_layout()
    plt.savefig('/tmp/mse_gd.png', dpi=100, bbox_inches='tight')
    plt.show()
except ImportError:
    pass
print('\nTakeaway: MSE gradient descent converges to the OLS solution; '
      'convergence rate ∝ 1 - η*λ_max(X^TX)/m, so conditioning matters.')

print("Exercise 5 solution complete.")

Exercise 6 (★★): Gradient Checking

Implement a general-purpose gradient checker and use it to debug three functions: one with a correct gradient, one with a sign error, and one with a scaling error.

(a) Implement gradient_check(f, grad_f, x, h=1e-5) that returns the relative error:

$$\text{rel\_error} = \frac{\|\hat{g} - g\|}{\|\hat{g}\| + \|g\| + \varepsilon}$$

where $\hat{g}$ is the numerical gradient and $g = \nabla f(x)$ is the provided gradient.

(b) Test on:

  • Correct: $f(\mathbf{x}) = \|\mathbf{x}\|^2$, $\nabla f = 2\mathbf{x}$
  • Sign error: $\nabla f = -2\mathbf{x}$ (wrong sign)
  • Scaling error: $\nabla f = \mathbf{x}$ (missing factor of 2)

(c) Apply the checker to a two-layer neural network's weight gradient: $f(W) = \|\sigma(XW)\mathbf{v} - \mathbf{y}\|^2$ with $\sigma = \tanh$. Implement $\nabla_W f$ and verify it passes the gradient check (rel_error $< 10^{-5}$).

Code cell 20

# Your Solution
# Exercise 6 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 6.")

Code cell 21

# Solution
# Exercise 6 - reference solution

import numpy as np
np.random.seed(42)

def numerical_gradient(f, x, h=1e-5):
    grad = np.zeros_like(x, dtype=float)
    for i in range(len(x)):
        ei = np.zeros_like(x, dtype=float); ei[i] = 1.0
        grad[i] = (f(x + h*ei) - f(x - h*ei)) / (2*h)
    return grad

def gradient_check(f, grad_f_val, x, h=1e-5, eps=1e-8):
    num_grad = numerical_gradient(f, x, h)
    analytic = np.asarray(grad_f_val, dtype=float)
    diff = np.linalg.norm(num_grad - analytic)
    denom = np.linalg.norm(num_grad) + np.linalg.norm(analytic) + eps
    return diff / denom

print('=' * 55)
print('Exercise 6: Gradient Checking')
print('=' * 55)

x_test = np.random.randn(5)
f_sq = lambda x: np.dot(x, x)

def check_true(name, cond):
    print(f"{'PASS' if cond else 'FAIL'}{name}")

err_correct = gradient_check(f_sq, 2*x_test, x_test)
err_sign    = gradient_check(f_sq, -2*x_test, x_test)
err_scale   = gradient_check(f_sq, x_test, x_test)

print(f'Correct (2x):     rel_error = {err_correct:.2e}')
print(f'Sign error (-2x): rel_error = {err_sign:.2e}')
print(f'Scale error (x):  rel_error = {err_scale:.2e}')

check_true('correct gradient passes  (err < 1e-5)', err_correct < 1e-5)
check_true('sign error fails   (err > 1e-1)', err_sign > 1e-1)
check_true('scale error fails  (err > 1e-1)', err_scale > 1e-1)

# (c) Two-layer neural network gradient
np.random.seed(0)
m_nn, d_in, d_h = 20, 4, 6
X_nn = np.random.randn(m_nn, d_in)
v = np.random.randn(d_h)
y_nn = np.random.randn(m_nn)
W = np.random.randn(d_in, d_h) * 0.1

def f_nn(W_flat):
    W_ = W_flat.reshape(d_in, d_h)
    H = np.tanh(X_nn @ W_)
    return np.sum((H @ v - y_nn)**2)

# Analytic gradient:
# dL/dW = X^T [ 2*(H@v - y) * v^T * (1 - H^2) ]
def grad_nn(W):
    H = np.tanh(X_nn @ W)
    delta = 2 * (H @ v - y_nn)          # (m,)
    dH = delta[:, None] * v[None, :]    # (m, d_h)
    dW = X_nn.T @ (dH * (1 - H**2))    # (d_in, d_h)
    return dW

g_analytic = grad_nn(W)
err_nn = gradient_check(f_nn, g_analytic.flatten(), W.flatten())
print(f'\n2-layer network gradient: rel_error = {err_nn:.2e}')
check_true('network gradient passes check (err < 1e-5)', err_nn < 1e-5)
print('\nTakeaway: Gradient checking is indispensable when implementing backprop; '
      'rel_error < 1e-5 is the gold standard for floating-point correctness.')

print("Exercise 6 solution complete.")

Exercise 7 (★★★): Softmax Gradient and Cross-Entropy

The softmax function $\sigma : \mathbb{R}^K \to \Delta^{K-1}$ maps logits to probabilities:

$$\sigma(\mathbf{z})_k = \frac{e^{z_k}}{\sum_{j=1}^K e^{z_j}}$$

(a) Derive the Jacobian of softmax: $\frac{\partial \sigma_k}{\partial z_j} = \sigma_k(\delta_{kj} - \sigma_j)$. Express it in matrix form: $J_\sigma = \operatorname{diag}(\mathbf{p}) - \mathbf{p}\mathbf{p}^\top$.

(b) The cross-entropy loss for true label $y \in \{1,\ldots,K\}$ is $\mathcal{L} = -\log \sigma(\mathbf{z})_y = -z_y + \log\sum_j e^{z_j}$. Show $\nabla_\mathbf{z} \mathcal{L} = \mathbf{p} - \mathbf{e}_y$ (where $\mathbf{e}_y$ is the one-hot vector).

(c) Implement stable softmax (using the log-sum-exp trick) and its gradient, then verify with gradient checking.

(d) For a batch of $m$ examples with one-hot matrix $Y \in \{0,1\}^{m\times K}$ and logit matrix $Z \in \mathbb{R}^{m\times K}$, show $\nabla_Z \mathcal{L}_{\text{batch}} = \frac{1}{m}(P - Y)$, where $P_{ij} = \sigma(Z_i)_j$.

Code cell 23

# Your Solution
# Exercise 7 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 7.")

Code cell 24

# Solution
# Exercise 7 - reference solution

import numpy as np
np.random.seed(42)

def numerical_gradient(f, x, h=1e-5):
    grad = np.zeros_like(x, dtype=float)
    for i in range(len(x)):
        ei = np.zeros_like(x, dtype=float); ei[i] = 1.0
        grad[i] = (f(x + h*ei) - f(x - h*ei)) / (2*h)
    return grad

def check_close(name, got, expected, tol=1e-5):
    ok = np.allclose(got, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'}{name}")
    if not ok: print('  exp:', expected, '\n  got:', got)

def softmax(z):
    z_shifted = z - z.max()  # numerical stability
    e = np.exp(z_shifted)
    return e / e.sum()

def softmax_jacobian(z):
    p = softmax(z)
    return np.diag(p) - np.outer(p, p)

def ce_loss(z, y_true):
    p = softmax(z)
    return -np.log(p[y_true] + 1e-15)

def ce_grad(z, y_true):
    p = softmax(z)
    g = p.copy()
    g[y_true] -= 1.0
    return g

print('=' * 55)
print('Exercise 7: Softmax Gradient and Cross-Entropy')
print('=' * 55)

z = np.array([2.0, 1.0, 0.1])
y = 0
p = softmax(z)

# (a) Jacobian
J = softmax_jacobian(z)
J_expected = np.diag(p) - np.outer(p, p)
check_close('softmax Jacobian = diag(p) - pp^T', J, J_expected)

# Verify Jacobian via numerical diff
J_num = np.zeros((3, 3))
for j in range(3):
    ej = np.eye(3)[j] * 1e-5
    J_num[:, j] = (softmax(z + ej) - softmax(z - ej)) / (2e-5)
check_close('Jacobian matches numerical', J, J_num)

# (b) CE gradient = p - e_y
g_analytic = ce_grad(z, y)
f_ce = lambda z_: ce_loss(z_, y)
g_numerical = numerical_gradient(f_ce, z)
check_close('CE gradient = p - e_y', g_analytic, g_numerical)

# (d) Batch gradient
m_b, K = 10, 4
Z = np.random.randn(m_b, K)
Y_idx = np.random.randint(0, K, size=m_b)
Y_onehot = np.eye(K)[Y_idx]

# Batch probabilities
P = np.array([softmax(Z[i]) for i in range(m_b)])
batch_grad = (P - Y_onehot) / m_b

# Numerical check on one weight element
def f_batch(Z_flat):
    Z_ = Z_flat.reshape(m_b, K)
    total = 0.0
    for i in range(m_b):
        total += ce_loss(Z_[i], Y_idx[i])
    return total / m_b

num_batch = numerical_gradient(f_batch, Z.flatten()).reshape(m_b, K)
check_close('batch grad = (P-Y)/m', batch_grad, num_batch, tol=1e-4)
print('\nTakeaway: The elegance of softmax+CE: gradient is simply p-y, '
      'the prediction error — the same formula powers GPT training.')

print("Exercise 7 solution complete.")

Exercise 8 (★★★): Natural Gradient for Gaussian Model

The natural gradient preconditions updates by the inverse Fisher Information Matrix (FIM), making steps geometry-aware in probability space:

$$\tilde{\nabla}_\theta \mathcal{L} = F(\theta)^{-1} \nabla_\theta \mathcal{L}$$

For a Gaussian model $p(x \mid \mu, \sigma^2)$ with parameters $\theta = (\mu, \log\sigma)$, the FIM has a known diagonal form.

(a) Show that for $p(x \mid \mu, \sigma) = \mathcal{N}(x; \mu, \sigma^2)$, the per-sample FIM is:

$$F = \begin{pmatrix} 1/\sigma^2 & 0 \\ 0 & 2 \end{pmatrix}$$

when using parameters $(\mu, \log\sigma)$ (the full-data FIM for $N$ i.i.d. samples is $N$ times this).

(b) Generate $N = 500$ samples from $\mathcal{N}(3, 0.5^2)$. Starting from $\theta_0 = (0, 0)$ (i.e. $\mu_0 = 0$, $\sigma_0 = 1$), fit using gradient descent on $-\log p(\mathcal{D} \mid \theta)$ with:

  • Standard gradient descent
  • Natural gradient descent

(c) Plot the parameter trajectories in $(\mu, \log\sigma)$ space on top of contours of the negative log-likelihood. Observe that natural gradient takes a more direct path.

(d) Verify that at convergence, both methods recover $\hat{\mu} \approx 3$ and $\hat{\sigma} \approx 0.5$.

Code cell 26

# Your Solution
# Exercise 8 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 8.")

Code cell 27

# Solution
# Exercise 8 - reference solution

import numpy as np
np.random.seed(42)

def check_close(name, got, expected, tol=0.05):
    ok = np.allclose(got, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'}{name}")
    if not ok: print(f'  expected {expected}, got {got}')

N = 500
mu_true, sigma_true = 3.0, 0.5
data = mu_true + sigma_true * np.random.randn(N)

def neg_log_likelihood(theta, data):
    mu, log_s = theta[0], theta[1]
    sigma = np.exp(log_s)
    return 0.5 * np.sum(((data - mu) / sigma)**2) + N * log_s + 0.5 * N * np.log(2 * np.pi)

def nll_grad(theta, data):
    mu, log_s = theta[0], theta[1]
    sigma = np.exp(log_s)
    # d NLL / d mu = -sum (x_i - mu) / sigma^2
    grad_mu = -np.sum(data - mu) / sigma**2
    # d NLL / d log_sigma = N - sum (x_i-mu)^2 / sigma^2
    grad_ls = N - np.sum((data - mu)**2) / sigma**2
    return np.array([grad_mu, grad_ls])

def fisher_info(theta):
    sigma = np.exp(theta[1])
    return np.diag([N / sigma**2, 2 * N])

def run_optimizer(theta0, data, lr, n_steps, use_natural=False):
    theta = theta0.copy()
    path = [theta.copy()]
    for _ in range(n_steps):
        g = nll_grad(theta, data)
        if use_natural:
            F = fisher_info(theta)
            step = np.linalg.solve(F, g)
        else:
            step = g
        theta -= lr * step
        path.append(theta.copy())
    return np.array(path)

theta0 = np.array([0.0, 0.0])
gd_path  = run_optimizer(theta0, data, lr=1e-4, n_steps=300, use_natural=False)
ng_path  = run_optimizer(theta0, data, lr=0.01, n_steps=300, use_natural=True)

print('=' * 55)
print('Exercise 8: Natural Gradient')
print('=' * 55)

# (d) Verify convergence
gd_mu, gd_sigma = gd_path[-1][0], np.exp(gd_path[-1][1])
ng_mu, ng_sigma = ng_path[-1][0], np.exp(ng_path[-1][1])

print(f'GD  final: mu={gd_mu:.4f}, sigma={gd_sigma:.4f}')
print(f'NG  final: mu={ng_mu:.4f}, sigma={ng_sigma:.4f}')
print(f'True:      mu={mu_true:.4f}, sigma={sigma_true:.4f}')

check_close('GD recovers mu', gd_mu, mu_true, tol=0.1)
check_close('NG recovers mu', ng_mu, mu_true, tol=0.05)
check_close('NG recovers sigma', ng_sigma, sigma_true, tol=0.05)

# (b) Steps to converge: compare NLL values
nll_star = neg_log_likelihood(np.array([mu_true, np.log(sigma_true)]), data)
gd_nlls = [neg_log_likelihood(t, data) - nll_star for t in gd_path]
ng_nlls = [neg_log_likelihood(t, data) - nll_star for t in ng_path]

try:
    import matplotlib.pyplot as plt
    plt.style.use('seaborn-v0_8-whitegrid')
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    # Trajectory in parameter space
    mu_g = np.linspace(-0.5, 3.5, 100)
    ls_g = np.linspace(-1.5, 0.8, 100)
    MU, LS = np.meshgrid(mu_g, ls_g)
    NLL = np.array([[neg_log_likelihood(np.array([m, l]), data)
                     for m in mu_g] for l in ls_g])

    ax = axes[0]
    ax.contourf(MU, LS, NLL, levels=30, cmap='viridis', alpha=0.5)
    ax.plot(gd_path[:,0], gd_path[:,1], 'o-', ms=2, lw=1.5, color='#D6604D', label='GD')
    ax.plot(ng_path[:,0], ng_path[:,1], 's-', ms=2, lw=1.5, color='#2166AC', label='Natural GD')
    ax.plot(mu_true, np.log(sigma_true), 'k*', ms=14, label='True params')
    ax.set_xlabel('$\\mu$'); ax.set_ylabel('$\\log\\sigma$')
    ax.set_title('Parameter space trajectories')
    ax.legend(fontsize=9)

    axes[1].semilogy(np.maximum(gd_nlls, 1e-3), color='#D6604D', lw=2, label='GD')
    axes[1].semilogy(np.maximum(ng_nlls, 1e-3), color='#2166AC', lw=2, label='Natural GD')
    axes[1].set_xlabel('Step'); axes[1].set_ylabel('NLL - NLL* (log)')
    axes[1].set_title('NLL convergence')
    axes[1].legend()
    plt.tight_layout()
    plt.savefig('/tmp/natural_gradient.png', dpi=100, bbox_inches='tight')
    plt.show()
except ImportError:
    pass
print('\nTakeaway: Natural gradient removes the ill-conditioning from parameter space; '
      'FIM^{-1}∇L converts Euclidean gradient steps to Riemannian steps on the '
      'statistical manifold — the foundation of K-FAC, EKFAC, and Shampoo optimizers.')

print("Exercise 8 solution complete.")

Exercise 9 (★★★): Softmax Cross-Entropy Gradient

For logits $z \in \mathbb{R}^K$, probabilities $p = \operatorname{softmax}(z)$, and one-hot label $y$, the loss is

$$L(z, y) = -\sum_i y_i \log p_i.$$

Derive $\nabla_z L = p - y$ and verify it numerically.

Code cell 29

# Your Solution
# Exercise 9 - learner workspace
# Derive and verify the softmax cross-entropy gradient.
print("Learner workspace ready for Exercise 9.")

Code cell 30

# Solution
# Exercise 9 - softmax cross-entropy gradient
header("Exercise 9: softmax cross-entropy gradient")

z = np.array([1.2, -0.3, 0.7, 2.0])
y = np.array([0.0, 0.0, 0.0, 1.0])

def loss(z):
    p = softmax(z)
    return -np.sum(y * np.log(p + 1e-12))

p = softmax(z)
analytic = p - y
numeric = numerical_gradient(loss, z)
print("probabilities:", p)
print("analytic grad:", analytic)
print("numeric grad:", numeric)
check_close("gradient p-y", analytic, numeric, tol=1e-6)
check_close("gradient sums to zero", analytic.sum(), 0.0, tol=1e-10)
print("Takeaway: multiclass backprop starts with prediction error in logit space.")

Exercise 10 (★★★): FGSM Direction as Steepest Loss Increase

For the logistic loss $\ell(w; x, y) = \log(1 + \exp(-y\, w^\top x))$ with $y \in \{-1, 1\}$, compute $\nabla_x \ell$ and show that the fastest increase under an $\ell_\infty$ perturbation uses

$$\delta = \epsilon\,\operatorname{sign}(\nabla_x \ell).$$

Code cell 32

# Your Solution
# Exercise 10 - learner workspace
# Compute the input gradient and compare random perturbations with FGSM.
print("Learner workspace ready for Exercise 10.")

Code cell 33

# Solution
# Exercise 10 - FGSM direction
header("Exercise 10: FGSM steepest direction")

w = np.array([0.8, -1.1, 0.4])
x = np.array([0.2, 0.5, -0.7])
y = 1.0
eps = 0.05

def logistic_loss_x(x):
    margin = y * (w @ x)
    return np.logaddexp(0.0, -margin)

def grad_x(x):
    margin = y * (w @ x)
    return -y * w * sigmoid(-margin)

g = grad_x(x)
delta_fgsm = eps * np.sign(g)
base = logistic_loss_x(x)
fgsm_increase = logistic_loss_x(x + delta_fgsm) - base
rng = np.random.default_rng(42)
random_increases = []
for _ in range(2000):
    delta = rng.uniform(-eps, eps, size=x.shape)
    random_increases.append(logistic_loss_x(x + delta) - base)
print("grad_x:", g)
print(f"FGSM increase={fgsm_increase:.8f}, best random={max(random_increases):.8f}")
check_true("FGSM beats sampled random l_inf perturbations", fgsm_increase >= max(random_increases) - 1e-8)
print("Takeaway: adversarial first-order attacks are directional derivative geometry.")

What to Review After Finishing

  • Partial derivatives — can you compute $\partial f/\partial x_i$ for polynomial, exponential, and composite functions from the limit definition?
  • Gradient geometry — can you show $\nabla f \perp$ level sets and that $\nabla f$ is the steepest ascent direction via Cauchy-Schwarz?
  • Directional derivatives — can you find max/min directions and identify neutral directions?
  • Matrix calculus — do you know the three key identities ($\nabla(\mathbf{a}^\top\mathbf{x})$, $\nabla(\mathbf{x}^\top A\mathbf{x})$, $\nabla\|A\mathbf{x}-\mathbf{b}\|^2$) and can you prove them?
  • ML gradients — can you derive and implement the MSE and CE gradients?
  • Gradient checking — do you know the relative error formula and when to use it?
  • Softmax + CE gradient — do you understand why the gradient is $\mathbf{p} - \mathbf{y}$?
  • Natural gradient — can you explain why standard GD is geometry-blind and how the FIM corrects this?

References

  1. Goodfellow, Bengio & Courville, Deep Learning (2016) — Ch. 4 (Numerical Computation)
  2. Boyd & Vandenberghe, Convex Optimization (2004) — Ch. 3 (Convex Functions)
  3. Amari, Information Geometry and Its Applications (2016) — Ch. 1–2 (Natural Gradient)
  4. Magnus & Neudecker, Matrix Differential Calculus (2019) — canonical reference
  5. Matrix Cookbook — identity reference (equations 61–82 for gradient identities)