Math for LLMs: Theory Notebook

Multivariate Calculus / Partial Derivatives and Gradients

Converted from theory.ipynb for web reading.

Partial Derivatives and Gradients

"The gradient is the generalization of the derivative to functions of several variables, and it points in the direction in which the function increases most rapidly."

Interactive theory notebook covering partial derivatives, gradient fields, directional derivatives, and gradient applications in machine learning.

Topics: Scalar fields → Partial derivatives → Gradient vector → Directional derivatives → ML gradients → Loss landscapes → Numerical differentiation

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

import numpy as np
import numpy.linalg as la
from scipy import optimize, special, stats
from scipy.optimize import minimize, fsolve, linprog
from math import factorial
import math
import matplotlib.patches as patches

COLORS = {
    "primary": "#0077BB",
    "secondary": "#EE7733",
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",
    "highlight": "#EE3377",
}
HAS_MPL = True
np.set_printoptions(precision=8, suppress=True)
np.random.seed(42)
spmin = minimize

try:
    import torch
    HAS_TORCH = True
except ImportError:
    torch = None
    HAS_TORCH = False


def header(title):
    print("\n" + "=" * len(title))
    print(title)
    print("=" * len(title))

def check_true(name, cond):
    ok = bool(cond)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    return ok

def check_close(name, got, expected, tol=1e-8):
    ok = np.allclose(got, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}: got {got}, expected {expected}")
    return ok

def check(name, got, expected, tol=1e-8):
    return check_close(name, got, expected, tol=tol)

def sigmoid(x):
    x = np.asarray(x, dtype=float)
    return np.where(x >= 0, 1/(1+np.exp(-x)), np.exp(x)/(1+np.exp(x)))

def softmax(z, axis=-1):
    z = np.asarray(z, dtype=float)
    z = z - np.max(z, axis=axis, keepdims=True)
    e = np.exp(z)
    return e / np.sum(e, axis=axis, keepdims=True)

def relu(x):
    return np.maximum(0, x)

def relu_prime(x):
    return np.where(np.asarray(x) > 0, 1.0, 0.0)

def centered_diff(f, x, h=1e-6):
    return (f(x + h) - f(x - h)) / (2 * h)

def numerical_gradient(f, x, h=1e-6):
    x = np.asarray(x, dtype=float)
    grad = np.zeros_like(x, dtype=float)
    for idx in np.ndindex(x.shape):
        xp = x.copy(); xm = x.copy()
        xp[idx] += h; xm[idx] -= h
        grad[idx] = (f(xp) - f(xm)) / (2*h)
    return grad

def numerical_jacobian(f, x, h=1e-6):
    x = np.asarray(x, dtype=float)
    y0 = np.asarray(f(x), dtype=float)
    J = np.zeros((y0.size, x.size), dtype=float)
    flat = x.reshape(-1)
    for j in range(x.size):
        xp = flat.copy(); xm = flat.copy()
        xp[j] += h; xm[j] -= h
        J[:, j] = ((np.asarray(f(xp.reshape(x.shape))) - np.asarray(f(xm.reshape(x.shape)))) / (2*h)).reshape(-1)
    return J.reshape(y0.shape + x.shape)

def grad_check(f, x, analytic_grad, h=1e-6):
    numeric_grad = numerical_gradient(f, x, h=h)
    denom = la.norm(analytic_grad) + la.norm(numeric_grad) + 1e-12
    return la.norm(analytic_grad - numeric_grad) / denom



def jacobian_fd(f, x, h=1e-6):
    x = np.asarray(x, dtype=float)
    y0 = np.asarray(f(x), dtype=float)
    J = np.zeros((y0.size, x.size), dtype=float)
    flat = x.reshape(-1)
    for j in range(x.size):
        xp = flat.copy(); xm = flat.copy()
        xp[j] += h; xm[j] -= h
        yp = np.asarray(f(xp.reshape(x.shape)), dtype=float).reshape(-1)
        ym = np.asarray(f(xm.reshape(x.shape)), dtype=float).reshape(-1)
        J[:, j] = (yp - ym) / (2*h)
    return J.reshape(y0.shape + x.shape)

def hessian_fd(f, x, h=1e-5):
    x = np.asarray(x, dtype=float)
    H = np.zeros((x.size, x.size), dtype=float)
    flat = x.reshape(-1)
    for j in range(x.size):
        xp = flat.copy(); xm = flat.copy()
        xp[j] += h; xm[j] -= h
        gp = numerical_gradient(lambda z: f(z.reshape(x.shape)), xp.reshape(x.shape), h=h).reshape(-1)
        gm = numerical_gradient(lambda z: f(z.reshape(x.shape)), xm.reshape(x.shape), h=h).reshape(-1)
        H[:, j] = (gp - gm) / (2*h)
    return H.reshape(x.shape + x.shape)



def grad_fd(f, x, h=1e-6):
    return numerical_gradient(f, x, h=h)



def fd_grad(f, x, h=1e-6):
    return numerical_gradient(f, np.asarray(x, dtype=float), h=h)

print("Chapter helper setup complete.")

1. Functions of Multiple Variables — Scalar Fields

A scalar field $f: \mathbb{R}^n \to \mathbb{R}$ assigns a number to each point. Visualizing $f: \mathbb{R}^2 \to \mathbb{R}$ as a surface $z = f(x,y)$ provides geometric intuition for partial derivatives and gradients.

Code cell 5

# === 2.1 Scalar Fields — Surface and Contour Plots ===

x = np.linspace(-3, 3, 80)
y = np.linspace(-3, 3, 80)
X, Y = np.meshgrid(x, y)

# Three classic scalar fields
f_bowl   = X**2 + Y**2                  # paraboloid (convex, unique min)
f_saddle = X**2 - Y**2                  # saddle (min in x, max in y)
f_gauss  = np.exp(-(X**2 + Y**2)/2)     # Gaussian (smooth hill)

fig, axes = plt.subplots(1, 3, figsize=(15, 5))
fig.suptitle('Scalar Fields: $f: \\mathbb{R}^2 \\to \\mathbb{R}$', fontsize=15)

for ax, F, title in zip(axes,
                         [f_bowl, f_saddle, f_gauss],
                         ['$f = x^2 + y^2$ (bowl)', '$f = x^2 - y^2$ (saddle)', '$f = e^{-(x^2+y^2)/2}$ (Gaussian)']):
    cp = ax.contourf(X, Y, F, levels=20, cmap='plasma')
    fig.colorbar(cp, ax=ax)
    ax.contour(X, Y, F, levels=10, colors='white', alpha=0.4, linewidths=0.8)
    ax.set_title(title)
    ax.set_xlabel('$x$')
    ax.set_ylabel('$y$')

fig.tight_layout()
plt.show()
print('Plotted three canonical scalar fields.')

2.2 Level Sets and Contour Geometry

The level set $L_c(f) = \{\mathbf{x}: f(\mathbf{x}) = c\}$ is a curve (in 2D) or hypersurface (in $n$D) on which $f$ is constant. We will verify that $\nabla f \perp L_c(f)$ at every point.

Code cell 7

# === 2.2 Level Sets and Gradient Direction ===

x = np.linspace(-2.5, 2.5, 200)
y = np.linspace(-2.5, 2.5, 200)
X, Y = np.meshgrid(x, y)
F = X**2 + Y**2  # paraboloid

# Gradient field (analytic): nabla f = (2x, 2y)
xg, yg = np.meshgrid(np.linspace(-2, 2, 10), np.linspace(-2, 2, 10))
Gx = 2 * xg  # ∂f/∂x
Gy = 2 * yg  # ∂f/∂y

fig, ax = plt.subplots(figsize=(7, 7))
cp = ax.contour(X, Y, F, levels=[0.5, 1, 2, 4, 7], colors=COLORS['primary'])
ax.clabel(cp, fmt='$f=%.1f$', fontsize=10)
ax.quiver(xg, yg, Gx, Gy, color=COLORS['error'], alpha=0.75,
          label='$\\nabla f = (2x, 2y)^\\top$')
ax.set_title('Gradient $\\nabla f$ is perpendicular to level curves')
ax.set_xlabel('$x$'); ax.set_ylabel('$y$')
ax.set_aspect('equal')
ax.legend()
fig.tight_layout()
plt.show()

# Verify numerically: at (1, 1), gradient = (2,2), tangent direction ~ (-1,1)
grad_at_11 = np.array([2.0, 2.0])
tangent    = np.array([-1.0, 1.0])
dot = np.dot(grad_at_11, tangent)
print(f'nabla f(1,1) = {grad_at_11}')
print(f'Tangent to level curve at (1,1): {tangent}')
print(f'Dot product (should be 0): {dot}')
ok = np.isclose(dot, 0)
print(f'PASS - gradient perpendicular to level set' if ok else 'FAIL')

3. Partial Derivatives

The partial derivative $\frac{\partial f}{\partial x_i}$ measures the rate of change of $f$ when we move parallel to the $x_i$-axis, holding all other variables fixed.

$$\frac{\partial f}{\partial x_i}(\mathbf{a}) = \lim_{h \to 0} \frac{f(\mathbf{a} + h\mathbf{e}_i) - f(\mathbf{a})}{h}$$

Code cell 9

# === 3.1 Partial Derivative via Limit — Numerical Demo ===

def f(x, y):
    return x**2 * y + np.exp(x * y)

# Analytic partials
def df_dx(x, y): return 2*x*y + y*np.exp(x*y)
def df_dy(x, y): return x**2   + x*np.exp(x*y)

a = (1.0, 2.0)
analytic_x = df_dx(*a)
analytic_y = df_dy(*a)

# Numerical limit: shrink h and watch convergence
print('Convergence of forward difference to partial derivative at (1, 2):')
print(f'Analytic ∂f/∂x = {analytic_x:.8f}')
print()
for h in [1e-1, 1e-3, 1e-5, 1e-7, 1e-9]:
    numeric = (f(a[0]+h, a[1]) - f(*a)) / h
    err = abs(numeric - analytic_x)
    print(f'  h={h:.0e}  numeric={numeric:.8f}  error={err:.2e}')

print()
print(f'Analytic ∂f/∂y = {analytic_y:.8f}')
numeric_y = (f(a[0], a[1]+1e-5) - f(*a)) / 1e-5
print(f'Numeric  ∂f/∂y ≈ {numeric_y:.8f}')
numeric_x = (f(a[0]+1e-5, a[1]) - f(*a)) / 1e-5
ok = np.allclose([numeric_x, numeric_y], [analytic_x, analytic_y], rtol=1e-3)
print('PASS - numerical partials match analytic formulas' if ok else 'FAIL')

Code cell 10

# === 3.1 Geometric Meaning — Cross-Section Slopes ===

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle('Partial Derivatives as Slopes of Cross-Sections', fontsize=15)

# Left: surface with cross-section lines
x = np.linspace(-2, 2, 100)
y_fixed = 1.0  # fixed y for ∂f/∂x
x_fixed = 0.5  # fixed x for ∂f/∂y

f_cross_x = f(x, y_fixed)  # slice at y = y_fixed
f_cross_y = f(x_fixed, x)  # slice at x = x_fixed

axes[0].plot(x, f_cross_x, color=COLORS['primary'], lw=2.5, label=f'$f(x, {y_fixed})$')
x0 = 1.0
slope_x = df_dx(x0, y_fixed)
tangent_x = f(x0, y_fixed) + slope_x * (x - x0)
axes[0].plot(x, tangent_x, '--', color=COLORS['error'],
             label=f'tangent slope = $\\partial f/\\partial x$ = {slope_x:.2f}')
axes[0].scatter([x0], [f(x0, y_fixed)], color=COLORS['highlight'], s=80, zorder=5)
axes[0].set_title(f'Slice at $y={y_fixed}$: slope = $\\partial f/\\partial x$')
axes[0].set_xlabel('$x$'); axes[0].set_ylabel('$f(x, y_0)$')
axes[0].legend(fontsize=10)

axes[1].plot(x, f_cross_y, color=COLORS['secondary'], lw=2.5, label=f'$f({x_fixed}, y)$')
y0 = 2.0
slope_y = df_dy(x_fixed, y0)
tangent_y = f(x_fixed, y0) + slope_y * (x - y0)
axes[1].plot(x, tangent_y, '--', color=COLORS['error'],
             label=f'tangent slope = $\\partial f/\\partial y$ = {slope_y:.2f}')
axes[1].scatter([y0], [f(x_fixed, y0)], color=COLORS['highlight'], s=80, zorder=5)
axes[1].set_title(f'Slice at $x={x_fixed}$: slope = $\\partial f/\\partial y$')
axes[1].set_xlabel('$y$'); axes[1].set_ylabel('$f(x_0, y)$')
axes[1].legend(fontsize=10)

fig.tight_layout()
plt.show()
print(f'∂f/∂x at (1.0, 1.0) = {df_dx(1.0,1.0):.4f}')
print(f'∂f/∂y at (0.5, 2.0) = {df_dy(0.5,2.0):.4f}')

Code cell 11

# === 3.3-3.4 Mixed Partials — Clairaut's Theorem ===

# f(x,y) = x^3 * y^2 + sin(xy)
# ∂f/∂x = 3x^2 y^2 + y cos(xy)
# ∂f/∂y = 2x^3 y + x cos(xy)
# ∂²f/∂y∂x = 6x^2 y + cos(xy) - xy sin(xy)
# ∂²f/∂x∂y = 6x^2 y + cos(xy) - xy sin(xy)  (same!)

def f2(x, y): return x**3 * y**2 + np.sin(x*y)

def d2f_dydx(x, y): return 6*x**2*y + np.cos(x*y) - x*y*np.sin(x*y)
def d2f_dxdy(x, y): return 6*x**2*y + np.cos(x*y) - x*y*np.sin(x*y)

# Verify via finite differences
h = 1e-5
pts = [(1.0, 2.0), (0.5, -1.0), (np.pi, 1.0)]
print('Verification of Clairaut\'s theorem: ∂²f/∂y∂x = ∂²f/∂x∂y')
print(f"{'Point':>20}  {'∂²/∂y∂x':>12}  {'∂²/∂x∂y':>12}  {'Match':>8}")
for (x0, y0) in pts:
    # Numerical ∂²f/∂y∂x: first diff w.r.t. x, then diff w.r.t. y
    df_dx_at_yph = (f2(x0+h, y0+h) - f2(x0, y0+h)) / h
    df_dx_at_y   = (f2(x0+h, y0)   - f2(x0, y0))   / h
    mixed_yx = (df_dx_at_yph - df_dx_at_y) / h

    df_dy_at_xph = (f2(x0+h, y0+h) - f2(x0+h, y0)) / h
    df_dy_at_x   = (f2(x0, y0+h)   - f2(x0, y0))   / h
    mixed_xy = (df_dy_at_xph - df_dy_at_x) / h

    match = np.isclose(mixed_yx, mixed_xy, rtol=1e-4)
    print(f'  ({x0:.2f}, {y0:.2f}): {mixed_yx:>12.6f}  {mixed_xy:>12.6f}  {"PASS" if match else "FAIL"}')

print('\nClairaut\'s theorem: mixed partials are equal for C^2 functions.')

4. The Gradient Vector

The gradient $\nabla f(\mathbf{a}) \in \mathbb{R}^n$ collects all partial derivatives into a single column vector:

$$\nabla f(\mathbf{a}) = \begin{pmatrix} \partial f/\partial x_1 \\ \vdots \\ \partial f/\partial x_n \end{pmatrix}$$

Key theorem: $\nabla f(\mathbf{a})$ points in the direction of steepest ascent, with magnitude equal to the maximum directional derivative.

Code cell 13

# === 4.5 Gradient of Standard ML Forms ===

import numpy.linalg as la

np.random.seed(42)
n = 5
x = np.random.randn(n)
a = np.random.randn(n)
A = np.random.randn(n, n)
A_sym = (A + A.T) / 2  # make symmetric

print('=== Gradient formulas verification ===')

# 1. nabla(a^T x) = a
f1 = lambda x: a @ x
grad1_analytic = a.copy()
grad1_numeric = np.array([(f1(x + 1e-5*np.eye(n)[i]) - f1(x - 1e-5*np.eye(n)[i])) / 2e-5
                          for i in range(n)])
ok1 = np.allclose(grad1_analytic, grad1_numeric, atol=1e-5)
print(f'PASS - nabla(a^T x) = a' if ok1 else f'FAIL - nabla(a^T x)')

# 2. nabla(x^T x) = 2x
f2 = lambda x: x @ x
grad2_analytic = 2 * x
grad2_numeric = np.array([(f2(x + 1e-5*np.eye(n)[i]) - f2(x - 1e-5*np.eye(n)[i])) / 2e-5
                          for i in range(n)])
ok2 = np.allclose(grad2_analytic, grad2_numeric, atol=1e-5)
print(f'PASS - nabla(x^T x) = 2x' if ok2 else f'FAIL - nabla(x^T x)')

# 3. nabla(x^T A x) = (A + A^T) x
f3 = lambda x: x @ A @ x
grad3_analytic = (A + A.T) @ x
grad3_numeric = np.array([(f3(x + 1e-5*np.eye(n)[i]) - f3(x - 1e-5*np.eye(n)[i])) / 2e-5
                          for i in range(n)])
ok3 = np.allclose(grad3_analytic, grad3_numeric, atol=1e-5)
print(f'PASS - nabla(x^T A x) = (A+A^T)x' if ok3 else f'FAIL - nabla(x^T A x)')

# 4. nabla(x^T A_sym x) = 2 A_sym x
f4 = lambda x: x @ A_sym @ x
grad4_analytic = 2 * A_sym @ x
grad4_numeric = np.array([(f4(x + 1e-5*np.eye(n)[i]) - f4(x - 1e-5*np.eye(n)[i])) / 2e-5
                          for i in range(n)])
ok4 = np.allclose(grad4_analytic, grad4_numeric, atol=1e-5)
print(f'PASS - nabla(x^T A_sym x) = 2 A_sym x (A sym)' if ok4 else 'FAIL')

# 5. nabla ||Ax - b||^2 = 2 A^T(Ax - b)
m = 8
B = np.random.randn(m, n)
b = np.random.randn(m)
f5 = lambda x: la.norm(B @ x - b)**2
grad5_analytic = 2 * B.T @ (B @ x - b)
grad5_numeric = np.array([(f5(x + 1e-5*np.eye(n)[i]) - f5(x - 1e-5*np.eye(n)[i])) / 2e-5
                          for i in range(n)])
ok5 = np.allclose(grad5_analytic, grad5_numeric, atol=1e-4)
print(f'PASS - nabla ||Bx-b||^2 = 2B^T(Bx-b)' if ok5 else 'FAIL')

4.2 The Gradient Points in the Direction of Steepest Ascent

Proof via Cauchy-Schwarz:

$$D_{\mathbf{u}} f = \nabla f^\top \mathbf{u} \leq \lVert\nabla f\rVert \cdot \lVert\mathbf{u}\rVert = \lVert\nabla f\rVert$$

Equality holds when $\mathbf{u} = \nabla f / \lVert\nabla f\rVert$. The gradient IS the direction of steepest ascent.

Code cell 15

# === 4.2 Steepest Ascent Direction Demo ===

import numpy as np
import matplotlib.pyplot as plt

COLORS = {'primary': '#0077BB', 'secondary': '#EE7733',
          'tertiary': '#009988', 'error': '#CC3311',
          'neutral': '#555555', 'highlight': '#EE3377'}

def f_test(x, y): return np.sin(x) * np.cos(y) + 0.3*(x**2 + y**2)
def grad_f(x, y):
    return np.array([np.cos(x)*np.cos(y) + 0.6*x,
                    -np.sin(x)*np.sin(y) + 0.6*y])

# At a specific point, compute directional derivatives in many directions
a = np.array([1.0, 0.5])
g = grad_f(*a)
grad_norm = np.linalg.norm(g)

thetas = np.linspace(0, 2*np.pi, 360)
dir_derivs = []
for theta in thetas:
    u = np.array([np.cos(theta), np.sin(theta)])
    dir_derivs.append(g @ u)  # = ||g|| cos(angle)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle('Directional Derivative vs. Direction', fontsize=15)

axes[0].plot(np.degrees(thetas), dir_derivs, color=COLORS['primary'])
max_idx = np.argmax(dir_derivs)
axes[0].axvline(np.degrees(thetas[max_idx]), color=COLORS['error'],
                ls='--', label=f'Max at θ={np.degrees(thetas[max_idx]):.1f}°')
axes[0].axhline(grad_norm, color=COLORS['tertiary'],
                ls=':', label=f'||∇f|| = {grad_norm:.3f}')
axes[0].set_xlabel('Direction angle θ (degrees)')
axes[0].set_ylabel('$D_u f$')
axes[0].set_title('Directional derivative at $(1, 0.5)$')
axes[0].legend()

# Right: gradient field
xg, yg = np.meshgrid(np.linspace(-2, 2, 12), np.linspace(-2, 2, 12))
Gx = np.cos(xg)*np.cos(yg) + 0.6*xg
Gy = -np.sin(xg)*np.sin(yg) + 0.6*yg
axes[1].quiver(xg, yg, Gx, Gy, color=COLORS['primary'], alpha=0.75)
axes[1].scatter(*a, color=COLORS['error'], s=100, zorder=5, label='Query point')
axes[1].arrow(*a, *(0.4*g/grad_norm), head_width=0.08, color=COLORS['error'])
axes[1].set_title('Gradient field $\\nabla f$')
axes[1].set_xlabel('$x$'); axes[1].set_ylabel('$y$')
axes[1].legend()
fig.tight_layout()
plt.show()

grad_dir = g / grad_norm
max_dir  = np.array([np.cos(thetas[max_idx]), np.sin(thetas[max_idx])])
print(f'Gradient direction: {grad_dir}')
print(f'Direction of max D_u f: {max_dir}')
print(f'PASS - match' if np.allclose(grad_dir, max_dir, atol=0.01) else 'FAIL')

5. Directional Derivatives

The directional derivative $D_{\mathbf{u}} f(\mathbf{a}) = \nabla f(\mathbf{a})^\top \mathbf{u}$ gives the rate of change in any direction $\mathbf{u}$ (unit vector). It is maximized in the direction $\hat{\nabla} f$ (the normalized gradient) and zero along level sets.

Code cell 17

# === 5.3 Extremizing the Directional Derivative ===

import numpy as np
import numpy.linalg as la

# f(x,y) = e^(x^2 + y^2) at a = (1, 0)
def f_exp(x, y): return np.exp(x**2 + y**2)
def grad_exp(x, y): return np.array([2*x*np.exp(x**2+y**2),
                                      2*y*np.exp(x**2+y**2)])

a = (1.0, 0.0)
g = grad_exp(*a)
print(f'f(1,0) = {f_exp(*a):.4f}')
print(f'nabla f(1,0) = {g}')
print(f'||nabla f(1,0)|| = {la.norm(g):.4f}')

# Sample directions and compute directional derivatives
test_dirs = {
    'Steepest ascent (grad dir)': g / la.norm(g),
    'North (0,1)':                np.array([0., 1.]),
    'East (1,0)':                 np.array([1., 0.]),
    'Steepest descent':           -g / la.norm(g),
    'Along level set':            np.array([0., 1.]) if np.isclose(g[1],0) else
                                  np.array([-g[1], g[0]]) / la.norm(g),
}

print()
print(f"{'Direction':>30}  {'u':>14}  {'D_u f':>8}")
for name, u in test_dirs.items():
    u = u / la.norm(u)  # ensure unit
    Du = g @ u
    print(f'  {name:>28}  {str(u.round(3)):>14}  {Du:>8.4f}')

print(f'\nMax D_u f = ||grad|| = {la.norm(g):.4f}')
ok = np.isclose(max([g@(u/la.norm(u)) for u in test_dirs.values()]), la.norm(g), atol=0.01)
print(f'PASS - maximum equals gradient norm' if ok else 'FAIL')

6. Gradient in Machine Learning

We compute and verify gradients for the most important ML loss functions.

Code cell 19

# === 6.1 MSE Gradient — Derivation and Verification ===

import numpy as np
import numpy.linalg as la

np.random.seed(42)
m, d = 50, 4
X = np.random.randn(m, d)
w_true = np.array([2.0, -1.0, 0.5, 1.5])
y = X @ w_true + 0.1 * np.random.randn(m)

def mse_loss(w):
    return (1/m) * la.norm(X @ w - y)**2

def mse_grad(w):
    return (2/m) * X.T @ (X @ w - y)

w0 = np.zeros(d)

# Verify gradient via centered differences
h = 1e-5
num_grad = np.array([(mse_loss(w0 + h*np.eye(d)[i]) - mse_loss(w0 - h*np.eye(d)[i])) / (2*h)
                     for i in range(d)])
anal_grad = mse_grad(w0)

print('MSE gradient verification:')
print(f'Analytic:  {anal_grad}')
print(f'Numerical: {num_grad}')
rel_err = la.norm(anal_grad - num_grad) / (la.norm(anal_grad) + la.norm(num_grad))
print(f'Relative error: {rel_err:.2e}')
print(f'PASS - MSE gradient correct' if rel_err < 1e-6 else 'FAIL')

# Verify: gradient = 0 at OLS solution
w_ols = la.lstsq(X, y, rcond=None)[0]
grad_at_ols = mse_grad(w_ols)
print(f'\n||nabla L at OLS solution|| = {la.norm(grad_at_ols):.2e} (should be ~0)')
print(f'PASS - gradient zero at OLS' if la.norm(grad_at_ols) < 1e-10 else 'FAIL')

Code cell 20

# === 6.2 Logistic Regression Gradient ===

import numpy as np
import numpy.linalg as la

np.random.seed(42)
m, d = 100, 3
X_cls = np.random.randn(m, d)
w_cls_true = np.array([1.0, -2.0, 0.5])
logits = X_cls @ w_cls_true
y_cls = (logits + 0.5 * np.random.randn(m) > 0).astype(float)

def sigmoid(z): return 1.0 / (1.0 + np.exp(-np.clip(z, -30, 30)))

def ce_loss(w):
    p = sigmoid(X_cls @ w)
    return -(1/m) * np.sum(y_cls * np.log(p + 1e-10) + (1-y_cls) * np.log(1-p + 1e-10))

def ce_grad(w):
    p = sigmoid(X_cls @ w)
    return (1/m) * X_cls.T @ (p - y_cls)  # elegant formula!

w0 = np.zeros(d)
h = 1e-5
num_grad = np.array([(ce_loss(w0 + h*np.eye(d)[i]) - ce_loss(w0 - h*np.eye(d)[i])) / (2*h)
                     for i in range(d)])
anal_grad = ce_grad(w0)

rel_err = la.norm(anal_grad - num_grad) / (la.norm(anal_grad) + la.norm(num_grad))
print('Cross-entropy gradient: (1/m) X^T (p - y)')
print(f'Analytic:  {anal_grad}')
print(f'Numerical: {num_grad}')
print(f'Relative error: {rel_err:.2e}')
print(f'PASS - CE gradient correct' if rel_err < 1e-6 else 'FAIL')

Code cell 21

# === 6.5 Gradient Descent on MSE Loss ===

import numpy as np
import numpy.linalg as la
import matplotlib.pyplot as plt

COLORS = {'primary': '#0077BB', 'secondary': '#EE7733',
          'tertiary': '#009988', 'error': '#CC3311',
          'neutral': '#555555', 'highlight': '#EE3377'}

np.random.seed(42)
m, d = 50, 4
X = np.random.randn(m, d)
w_true = np.array([2.0, -1.0, 0.5, 1.5])
y = X @ w_true + 0.1 * np.random.randn(m)
def mse_loss(w): return (1/m) * la.norm(X @ w - y)**2
def mse_grad(w): return (2/m) * X.T @ (X @ w - y)

# Run gradient descent with multiple learning rates
T = 200
lrs = [0.001, 0.01, 0.1]
fig, ax = plt.subplots(figsize=(10, 5))

for lr, color in zip(lrs, [COLORS['primary'], COLORS['secondary'], COLORS['error']]):
    w = np.zeros(d)
    losses = [mse_loss(w)]
    for _ in range(T):
        w -= lr * mse_grad(w)
        losses.append(mse_loss(w))
    ax.semilogy(losses, color=color, label=f'$\\eta={lr}$')

# OLS optimal loss (lower bound)
w_ols = la.lstsq(X, y, rcond=None)[0]
ax.axhline(mse_loss(w_ols), ls='--', color=COLORS['neutral'], label='OLS optimum')
ax.set_title('Gradient Descent on MSE — Effect of Learning Rate')
ax.set_xlabel('Step $t$'); ax.set_ylabel('Loss (log scale)')
ax.legend()
fig.tight_layout()
plt.show()

# Verify convergence
w = np.zeros(d)
for _ in range(500): w -= 0.01 * mse_grad(w)
print(f'GD solution:  {w}')
print(f'OLS solution: {w_ols}')
ok = np.allclose(w, w_ols, atol=0.01)
print(f'PASS - GD converges to OLS solution' if ok else 'FAIL')

Code cell 22

# === 6.5 2D Loss Landscape with Gradient Descent Path ===

import numpy as np
import matplotlib.pyplot as plt

COLORS = {'primary': '#0077BB', 'secondary': '#EE7733',
          'tertiary': '#009988', 'error': '#CC3311',
          'neutral': '#555555', 'highlight': '#EE3377'}

# 2D quadratic loss: L(w1, w2) = (w1 - 1)^2 + 10*(w2 - 1)^2
# (elongated ellipse — ill-conditioned)
def loss_2d(w): return (w[0]-1)**2 + 10*(w[1]-1)**2
def grad_2d(w): return np.array([2*(w[0]-1), 20*(w[1]-1)])

w1, w2 = np.meshgrid(np.linspace(-1, 3, 200), np.linspace(-1, 3, 200))
L = (w1-1)**2 + 10*(w2-1)**2

fig, axes = plt.subplots(1, 2, figsize=(14, 6))
fig.suptitle('Gradient Descent on Elongated Loss Landscape', fontsize=15)

for ax, lr, title in zip(axes, [0.05, 0.09],
                          ['$\\eta=0.05$ (slow but stable)', '$\\eta=0.09$ (near oscillation)']):
    cp = ax.contourf(w1, w2, L, levels=30, cmap='plasma')
    fig.colorbar(cp, ax=ax)
    ax.contour(w1, w2, L, levels=10, colors='white', alpha=0.3)

    # Run GD
    w = np.array([-0.5, -0.5])
    path = [w.copy()]
    for _ in range(40):
        w = w - lr * grad_2d(w)
        path.append(w.copy())
    path = np.array(path)

    ax.plot(path[:,0], path[:,1], 'o-', color=COLORS['error'],
            ms=3, lw=1.5, label='GD path')
    ax.scatter([1], [1], marker='*', s=200, color='gold', zorder=5, label='Minimum (1,1)')
    ax.set_xlim(-1, 3); ax.set_ylim(-1, 3)
    ax.set_xlabel('$w_1$'); ax.set_ylabel('$w_2$')
    ax.set_title(title)
    ax.legend(fontsize=9)

fig.tight_layout()
plt.show()
print('Elongated contours cause GD to zigzag — motivates Adam and preconditioning.')

7. Numerical Differentiation

Gradient checking uses centered finite differences to verify analytical gradient implementations.

$$\frac{\partial f}{\partial x_i}(\mathbf{a}) \approx \frac{f(\mathbf{a}+h\mathbf{e}_i) - f(\mathbf{a}-h\mathbf{e}_i)}{2h} + O(h^2)$$

Code cell 24

# === 7.1-7.2 Finite Difference Error vs. Step Size ===

import numpy as np
import matplotlib.pyplot as plt

COLORS = {'primary': '#0077BB', 'secondary': '#EE7733',
          'tertiary': '#009988', 'error': '#CC3311',
          'neutral': '#555555', 'highlight': '#EE3377'}

# f(x) = exp(sin(x^2)) at x = 1.0
f   = lambda x: np.exp(np.sin(x**2))
df  = lambda x: np.exp(np.sin(x**2)) * np.cos(x**2) * 2*x  # analytic

x0 = 1.0
true_grad = df(x0)
h_vals = np.logspace(-12, 0, 100)

fwd_err, cen_err = [], []
for h in h_vals:
    fwd = (f(x0 + h) - f(x0)) / h
    cen = (f(x0 + h) - f(x0 - h)) / (2*h)
    fwd_err.append(abs(fwd - true_grad))
    cen_err.append(abs(cen - true_grad))

fig, ax = plt.subplots(figsize=(10, 6))
ax.loglog(h_vals, fwd_err, color=COLORS['secondary'], label='Forward diff: $O(h)$ error')
ax.loglog(h_vals, cen_err, color=COLORS['primary'], label='Centered diff: $O(h^2)$ error')
ax.axvline(1e-5, ls='--', color=COLORS['neutral'], label='Optimal $h \\approx 10^{-5}$')

# Reference lines
h_ref = np.logspace(-7, -1, 20)
ax.loglog(h_ref, 2e-8*h_ref, 'k-', alpha=0.4, label='$O(h)$ slope')
ax.loglog(h_ref, 2e-8*h_ref**2, 'k--', alpha=0.4, label='$O(h^2)$ slope')

ax.set_title('Numerical Gradient Error vs. Step Size $h$')
ax.set_xlabel('Step size $h$')
ax.set_ylabel('|error|')
ax.legend()
fig.tight_layout()
plt.show()

# Find optimal h for centered differences
opt_idx = np.argmin(cen_err)
print(f'Optimal h for centered differences: {h_vals[opt_idx]:.2e}')
print(f'Minimum error: {cen_err[opt_idx]:.2e}')
print(f'\nKey insight: optimal h ≈ epsilon_mach^(1/3) ≈ {np.cbrt(np.finfo(float).eps):.2e}')

Code cell 25

# === 7.3 Gradient Checking Implementation ===

import numpy as np
import numpy.linalg as la

def numerical_gradient(f, x, h=1e-5):
    """Centered-difference gradient for f: R^n -> R."""
    n = len(x)
    grad = np.zeros(n)
    for i in range(n):
        e = np.zeros(n); e[i] = 1.0
        grad[i] = (f(x + h*e) - f(x - h*e)) / (2*h)
    return grad

def gradient_check(f, grad_f, x, h=1e-5, tol=1e-5):
    """Compare analytic and numeric gradients. Returns relative error."""
    g_anal = grad_f(x)
    g_num  = numerical_gradient(f, x, h)
    rel_err = la.norm(g_anal - g_num) / (la.norm(g_anal) + la.norm(g_num) + 1e-15)
    status = 'PASS' if rel_err < tol else 'FAIL'
    return rel_err, status, g_anal, g_num

# Test on several functions
np.random.seed(42)
n = 6
x0 = np.random.randn(n)
A = np.random.randn(n, n)
A_sym = (A + A.T) / 2
b = np.random.randn(n)

tests = [
    ('a^T x',       lambda x: b @ x,               lambda x: b),
    ('x^T x',       lambda x: x @ x,               lambda x: 2*x),
    ('x^T A_sym x', lambda x: x @ A_sym @ x,       lambda x: 2*A_sym @ x),
    ('||Ax-b||^2',  lambda x: la.norm(A@x-b)**2,   lambda x: 2*A.T@(A@x-b)),
    ('log(|x|^2)',  lambda x: np.sum(np.log(x**2+1)), lambda x: 2*x/(x**2+1)),
]

print(f"{'Function':>20}  {'Rel. Error':>12}  Status")
for name, f_fn, g_fn in tests:
    rel_err, status, _, _ = gradient_check(f_fn, g_fn, x0)
    print(f'  {name:>18}  {rel_err:>12.2e}  {status}')

print('\nGradient checking: centered differences with h=1e-5 achieves ~1e-10 accuracy.')

8. Loss Landscape Geometry

Neural network loss functions are non-convex surfaces with local minima, saddle points, and flat regions. The gradient norm and Hessian eigenvalues describe the local geometry.

Code cell 27

# === 8.1 Non-Convex Loss Landscape Visualization ===

import numpy as np
import matplotlib.pyplot as plt

COLORS = {'primary': '#0077BB', 'secondary': '#EE7733',
          'tertiary': '#009988', 'error': '#CC3311',
          'neutral': '#555555', 'highlight': '#EE3377'}

# Modified Beale function (non-convex, multiple saddle points)
def beale(x, y):
    t1 = (1.5   - x + x*y)**2
    t2 = (2.25  - x + x*y**2)**2
    t3 = (2.625 - x + x*y**3)**2
    return t1 + t2 + t3

x = np.linspace(-3, 3, 300)
y = np.linspace(-1.5, 1.5, 300)
X, Y = np.meshgrid(x, y)
Z = beale(X, Y)

fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Contour plot
cp = axes[0].contourf(X, Y, np.log1p(Z), levels=40, cmap='plasma')
fig.colorbar(cp, ax=axes[0], label='log(1 + Loss)')
axes[0].contour(X, Y, np.log1p(Z), levels=15, colors='white', alpha=0.3, linewidths=0.5)
axes[0].scatter([3], [0.5], marker='*', s=300, color='gold', zorder=5, label='Global min')
axes[0].set_title('Non-convex loss landscape (Beale function)')
axes[0].set_xlabel('$w_1$'); axes[0].set_ylabel('$w_2$')
axes[0].legend()

# Gradient descent from multiple starts
def grad_beale(x, y, h=1e-5):
    gx = (beale(x+h, y) - beale(x-h, y)) / (2*h)
    gy = (beale(x, y+h) - beale(x, y-h)) / (2*h)
    return gx, gy

cp2 = axes[1].contourf(X, Y, np.log1p(Z), levels=40, cmap='plasma')
starts = [(-2, -1), (-1, 1), (0.5, -1), (1.5, 0.5)]
path_colors = [COLORS['error'], COLORS['tertiary'], COLORS['primary'], COLORS['highlight']]

for (x0, y0), color in zip(starts, path_colors):
    px, py = [x0], [y0]
    x_c, y_c = x0, y0
    for _ in range(200):
        gx, gy = grad_beale(x_c, y_c)
        norm = np.sqrt(gx**2 + gy**2) + 1e-10
        x_c -= 0.005 * gx
        y_c -= 0.005 * gy
        px.append(x_c); py.append(y_c)
    axes[1].plot(px, py, '-', color=color, alpha=0.8, lw=1.5,
                 label=f'start ({x0},{y0})')

axes[1].scatter([3], [0.5], marker='*', s=300, color='gold', zorder=5, label='Global min')
axes[1].set_xlim(-3, 3); axes[1].set_ylim(-1.5, 1.5)
axes[1].set_title('GD trajectories (different initializations)')
axes[1].set_xlabel('$w_1$'); axes[1].set_ylabel('$w_2$')
axes[1].legend(fontsize=8)
fig.tight_layout()
plt.show()
print('Non-convex landscape: different initializations converge to different local minima.')

Code cell 28

# === 8.2-8.3 Gradient Norm as Diagnostic + Gradient Clipping ===

import numpy as np
import matplotlib.pyplot as plt

COLORS = {'primary': '#0077BB', 'secondary': '#EE7733',
          'tertiary': '#009988', 'error': '#CC3311',
          'neutral': '#555555', 'highlight': '#EE3377'}

np.random.seed(42)

# Simulate gradient descent on a simple problem with gradient clipping
m, d = 100, 10
X = np.random.randn(m, d)
w_true = np.random.randn(d)
y = X @ w_true + 0.1 * np.random.randn(m)

def loss(w): return (1/m) * np.sum((X@w - y)**2)
def grad(w): return (2/m) * X.T @ (X@w - y)

def clip_grad(g, max_norm=1.0):
    norm = np.linalg.norm(g)
    if norm > max_norm:
        return g * max_norm / norm, norm
    return g, norm

T = 300
results = {}

for clip_val in [None, 0.5, 2.0]:
    w = np.ones(d) * 3  # bad init
    losses_t, norms_t = [], []
    for t in range(T):
        g = grad(w)
        if clip_val is not None:
            g, norm = clip_grad(g, clip_val)
        else:
            norm = np.linalg.norm(g)
        losses_t.append(loss(w))
        norms_t.append(norm)
        w -= 0.1 * g
    results[clip_val] = (losses_t, norms_t)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
labels = {None: 'No clipping', 0.5: 'Clip norm=0.5', 2.0: 'Clip norm=2.0'}
colors = [COLORS['primary'], COLORS['error'], COLORS['tertiary']]

for (clip_val, (ls, ns)), color in zip(results.items(), colors):
    axes[0].semilogy(ls, color=color, label=labels[clip_val])
    axes[1].semilogy(ns, color=color, label=labels[clip_val])

axes[0].set_title('Loss vs. Training Step'); axes[0].set_xlabel('Step')
axes[0].set_ylabel('Loss'); axes[0].legend()
axes[1].set_title('Gradient Norm vs. Step'); axes[1].set_xlabel('Step')
axes[1].set_ylabel('||∇L||')
axes[1].axhline(0.5, ls='--', color=COLORS['neutral'], label='Clip threshold 0.5')
axes[1].legend()
fig.tight_layout()
plt.show()

print('Gradient clipping: rescales gradient direction when norm exceeds threshold.')
print('Direction preserved. Used in all LLM training (GPT-3, Llama, Mistral).')

9. Advanced Topics

9.3 Natural Gradient vs. Gradient Descent

The Fisher information matrix $F(\boldsymbol{\theta})$ defines a Riemannian metric on the statistical manifold. The natural gradient $F^{-1}\nabla\mathcal{L}$ accounts for the non-Euclidean geometry and often converges faster than ordinary gradient descent.
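
As a sanity check of the closed-form Fisher matrix used in the cell below, here is a minimal Monte Carlo sketch (the parameter values and sample size are arbitrary choices) verifying that the empirical Fisher $\mathbb{E}[\nabla\log p\,\nabla\log p^\top]$ for a Gaussian parameterised by $(\mu, \log\sigma)$ matches $\mathrm{diag}(1/\sigma^2,\ 2)$:

import numpy as np
rng = np.random.default_rng(0)

mu, sigma = 0.7, 1.6
samples = rng.normal(mu, sigma, size=200_000)

# Score function (gradient of the log-density) in the (mu, log sigma) parameterisation
u = (samples - mu) / sigma
scores = np.stack([u / sigma,      # d log p / d mu        = (x - mu) / sigma^2
                   u**2 - 1.0])    # d log p / d log sigma = (x - mu)^2 / sigma^2 - 1

F_mc = scores @ scores.T / samples.size   # empirical E[score score^T]
print('Monte Carlo Fisher:\n', F_mc.round(3))
print('Closed form diag(1/sigma^2, 2):\n', np.diag([1 / sigma**2, 2.0]).round(3))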

Code cell 30

# === 9.3 Natural Gradient vs. Gradient Descent ===

import numpy as np
import numpy.linalg as la
import matplotlib.pyplot as plt

COLORS = {'primary': '#0077BB', 'secondary': '#EE7733',
          'tertiary': '#009988', 'error': '#CC3311',
          'neutral': '#555555', 'highlight': '#EE3377'}

np.random.seed(42)
# Fit Gaussian: theta = (mu, log_sigma)
# True params: mu=2, sigma=1 (log_sigma=0)
n_data = 200
x_data = np.random.normal(2.0, 1.0, n_data)

def neg_loglik(theta):
    mu, log_sig = theta
    sig = np.exp(log_sig)
    return 0.5*np.log(2*np.pi) + log_sig + 0.5*np.mean((x_data - mu)**2) / sig**2

def grad_nll(theta):
    mu, log_sig = theta
    sig = np.exp(log_sig)
    sig2 = sig**2
    g_mu  = -np.mean(x_data - mu) / sig2
    g_ls  = 1 - np.mean((x_data - mu)**2) / sig2
    return np.array([g_mu, g_ls])

def fisher_gaussian(theta):
    mu, log_sig = theta
    sig = np.exp(log_sig)
    # F = diag(1/sig^2, 2)
    return np.diag([1/sig**2, 2.0])

theta0 = np.array([0.0, 1.0])  # start: mu=0, sigma=exp(1)≈2.7
T = 100

# Gradient descent
th_gd = theta0.copy()
path_gd = [th_gd.copy()]
for _ in range(T):
    th_gd = th_gd - 0.1 * grad_nll(th_gd)
    path_gd.append(th_gd.copy())
path_gd = np.array(path_gd)

# Natural gradient descent
th_ng = theta0.copy()
path_ng = [th_ng.copy()]
for _ in range(T):
    g = grad_nll(th_ng)
    F = fisher_gaussian(th_ng)
    nat_g = la.solve(F, g)  # F^{-1} g
    th_ng = th_ng - 0.1 * nat_g
    path_ng.append(th_ng.copy())
path_ng = np.array(path_ng)

# Plot trajectories in (mu, log_sigma) space
fig, ax = plt.subplots(figsize=(8, 7))
ax.plot(path_gd[:,0], path_gd[:,1], 'o-', color=COLORS['primary'],
        ms=3, label='Gradient descent')
ax.plot(path_ng[:,0], path_ng[:,1], 's-', color=COLORS['error'],
        ms=3, label='Natural gradient descent')
ax.scatter([2.0], [0.0], marker='*', s=300, color='gold', zorder=5, label='True params')
ax.scatter(*theta0, s=100, color=COLORS['neutral'], zorder=5, label='Start')
ax.set_xlabel('$\\mu$'); ax.set_ylabel('$\\log\\sigma$')
ax.set_title('GD vs. Natural Gradient in parameter space')
ax.legend()
fig.tight_layout()
plt.show()

print(f'GD final:  mu={path_gd[-1,0]:.3f}, log_sigma={path_gd[-1,1]:.3f}')
print(f'NG final:  mu={path_ng[-1,0]:.3f}, log_sigma={path_ng[-1,1]:.3f}')
print(f'True:      mu=2.000, log_sigma=0.000')
ng_closer = la.norm(path_ng[-1] - [2,0]) < la.norm(path_gd[-1] - [2,0])
print(f'PASS - natural gradient converges faster' if ng_closer else 'Both converged equally')

9.4 Softmax Gradient Derivation

The Jacobian of $\operatorname{softmax}(\mathbf{z})_k = e^{z_k}/\sum_j e^{z_j}$ is:

$$\frac{\partial p_k}{\partial z_j} = p_k(\delta_{kj} - p_j)$$

Combined with cross-entropy, the gradient simplifies to $\mathbf{p} - \mathbf{y}$.
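
As a cross-check of that simplification, a minimal sketch (independent of the cell below) pushing the cross-entropy gradient $\partial\mathrm{CE}/\partial p_k = -y_k/p_k$ through the softmax Jacobian recovers $\mathbf{p} - \mathbf{y}$:

import numpy as np

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

rng = np.random.default_rng(0)
z = rng.normal(size=5)
y = np.zeros(5); y[2] = 1.0            # one-hot label

p = softmax(z)
J = np.diag(p) - np.outer(p, p)        # Jacobian: p_k (delta_kj - p_j)
grad_z_chain = J.T @ (-y / p)          # chain rule through the Jacobian
grad_z_short = p - y                   # simplified formula

print('Chain rule:', grad_z_chain.round(6))
print('p - y     :', grad_z_short.round(6))
print('PASS - chain rule reproduces p - y' if np.allclose(grad_z_chain, grad_z_short) else 'FAIL')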

Code cell 32

# === Softmax Jacobian and Cross-Entropy Gradient ===

import numpy as np
import numpy.linalg as la

def softmax(z):
    z = z - z.max()  # numerical stability
    e = np.exp(z)
    return e / e.sum()

def softmax_jacobian(p):
    """J[k,j] = p_k * (delta_kj - p_j)"""
    return np.diag(p) - np.outer(p, p)

def cross_entropy(z, y_onehot):
    p = softmax(z)
    return -np.sum(y_onehot * np.log(p + 1e-15))

def ce_grad_z(z, y_onehot):
    return softmax(z) - y_onehot  # elegant!

np.random.seed(42)
K = 5
z = np.random.randn(K)
y_true = np.zeros(K); y_true[2] = 1.0  # one-hot label for class 2

# Verify gradient
h = 1e-5
num_grad = np.array([(cross_entropy(z + h*np.eye(K)[i], y_true)
                      - cross_entropy(z - h*np.eye(K)[i], y_true)) / (2*h)
                     for i in range(K)])
anal_grad = ce_grad_z(z, y_true)

print('Softmax cross-entropy gradient verification:')
print(f'Analytic  (p - y): {anal_grad.round(4)}')
print(f'Numerical (FD):    {num_grad.round(4)}')
rel_err = la.norm(anal_grad - num_grad) / (la.norm(anal_grad) + la.norm(num_grad))
print(f'Relative error: {rel_err:.2e}')
print(f'PASS - softmax CE gradient correct' if rel_err < 1e-6 else 'FAIL')

# Softmax Jacobian
p = softmax(z)
J = softmax_jacobian(p)
print(f'\nSoftmax Jacobian shape: {J.shape}')
print(f'Jacobian is symmetric: {np.allclose(J, J.T)}')
print(f'Row sums = 0: {np.allclose(J.sum(axis=1), 0)}')
print(f'\nTakeaway: grad of CE + softmax = p - y. Elegant, stable, used in every transformer.')

8. Higher-Order Partial Derivatives and the Hessian Preview

Second-order partial derivatives measure the curvature of a scalar field — how fast the gradient itself changes. The matrix of all second partials is the Hessian $H_f$, which characterises loss landscape geometry.

Preview: The Hessian is covered in full in §02 — Jacobians and Hessians. Here we only compute it numerically and observe its role.

Code cell 34

# === 8.1 Numerical Hessian via Finite Differences ===

import numpy as np
np.random.seed(42)

def numerical_hessian(f, x, h=1e-4):
    """Compute Hessian via second-order central differences."""
    n = len(x)
    H = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            ei, ej = np.eye(n)[i], np.eye(n)[j]
            H[i, j] = (f(x + h*ei + h*ej) - f(x + h*ei - h*ej)
                       - f(x - h*ei + h*ej) + f(x - h*ei - h*ej)) / (4*h**2)
    return H

# Test on quadratic f(x,y) = x^2 + 3xy + 2y^2 → H = [[2,3],[3,4]]
f_quad = lambda x: x[0]**2 + 3*x[0]*x[1] + 2*x[1]**2
x0 = np.array([1.0, 2.0])
H_num = numerical_hessian(f_quad, x0)
H_exact = np.array([[2., 3.], [3., 4.]])
print('Numerical Hessian:\n', H_num)
print('Exact Hessian:    \n', H_exact)
ok = np.allclose(H_num, H_exact, atol=1e-6)
print(f'\nPASS: {ok} — Hessian matches to 1e-6')

# Eigenvalues tell us curvature directions
evals, evecs = np.linalg.eigh(H_exact)
print(f'\nEigenvalues: {evals}  (both positive → local min)')
print(f'Condition number: {evals[-1]/evals[0]:.2f}')

Code cell 35

# === 8.2 Hessian Eigenstructure Visualization ===

try:
    import matplotlib.pyplot as plt
    import matplotlib
    HAS_MPL = True
except ImportError:
    HAS_MPL = False

if HAS_MPL:
    COLORS = {'blue': '#2166AC', 'red': '#D6604D', 'green': '#4DAC26',
              'orange': '#F4A582', 'purple': '#762A83', 'gray': '#878787'}
    plt.style.use('seaborn-v0_8-whitegrid')
    plt.rcParams['figure.figsize'] = [10, 4]

    fig, axes = plt.subplots(1, 2)

    x = np.linspace(-2, 2, 300)
    y = np.linspace(-2, 2, 300)
    X, Y = np.meshgrid(x, y)
    Z = X**2 + 3*X*Y + 2*Y**2

    # Left: contours of the quadratic
    ax = axes[0]
    cs = ax.contourf(X, Y, Z, levels=20, cmap='viridis')
    plt.colorbar(cs, ax=ax)

    # Draw eigenvectors scaled by eigenvalue
    for i, (val, vec) in enumerate(zip(evals, evecs.T)):
        ax.annotate('', xy=vec*0.7, xytext=(0, 0),
                    arrowprops=dict(arrowstyle='->', color=list(COLORS.values())[i], lw=2))
        ax.text(vec[0]*0.8, vec[1]*0.8, f'λ={val:.1f}',
                fontsize=10, color=list(COLORS.values())[i], fontweight='bold')
    ax.set_title('Quadratic $f = x^2+3xy+2y^2$\nHessian eigenvectors')
    ax.set_xlabel('x'); ax.set_ylabel('y')

    # Right: Saddle point example
    f_saddle = lambda xy: xy[0]**2 - xy[1]**2
    Z2 = X**2 - Y**2
    H_saddle = np.array([[2., 0.], [0., -2.]])
    evals2, evecs2 = np.linalg.eigh(H_saddle)
    ax2 = axes[1]
    cs2 = ax2.contourf(X, Y, Z2, levels=20, cmap='RdBu_r')
    plt.colorbar(cs2, ax=ax2)
    ax2.set_title(f'Saddle $f=x^2-y^2$\nλ=({evals2[0]:.0f},{evals2[1]:.0f}) → saddle')
    ax2.set_xlabel('x'); ax2.set_ylabel('y')

    plt.tight_layout()
    plt.savefig('/tmp/hessian_curvature.png', dpi=100, bbox_inches='tight')
    plt.show()
    print('Hessian eigenstructure plots saved.')
else:
    print('matplotlib not available.')

9. Gradient Flow and Continuous-Time Optimization

The gradient descent iteration $\theta_{t+1} = \theta_t - \eta\,\nabla f(\theta_t)$ becomes, as $\eta \to 0$, the gradient flow ODE:

$$\frac{d\theta}{dt} = -\nabla f(\theta(t))$$

This ODE perspective gives analytical tools: for a strongly convex $f$ with constant $\mu > 0$, the flow satisfies $f(\theta(t)) - f^* \le e^{-2\mu t}\,[f(\theta_0) - f^*]$, an exponential decay. Modern analyses of neural collapse, loss landscape geometry, and sharpness-aware minimization all use gradient flow as the backbone model.
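
As a quick numerical check, a minimal sketch (using the same quadratic $f(x,y) = x^2 + 5y^2$ as the next cell, for which the smallest Hessian eigenvalue gives $\mu = 2$; the horizon and integrator settings are arbitrary) confirms that the integrated flow stays below the $e^{-2\mu t}$ envelope:

import numpy as np
from scipy.integrate import solve_ivp

f_val  = lambda xy: xy[0]**2 + 5*xy[1]**2            # f* = 0; strong convexity constant mu = 2
grad_f = lambda xy: np.array([2*xy[0], 10*xy[1]])
mu = 2.0

theta0 = np.array([2.0, 1.5])
sol = solve_ivp(lambda t, th: -grad_f(th), [0, 2], theta0, max_step=0.01, dense_output=True)

ts = np.linspace(0, 2, 50)
flow_loss = np.array([f_val(sol.sol(t)) for t in ts])
envelope  = np.exp(-2 * mu * ts) * f_val(theta0)     # e^{-2 mu t} [f(theta_0) - f*]

ok = np.all(flow_loss <= envelope * (1 + 1e-6))
print('PASS - gradient flow stays below the e^{-2 mu t} envelope' if ok else 'FAIL')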

Code cell 37

# === 9.1 Gradient Flow ODE vs Discrete Gradient Descent ===

from scipy.integrate import solve_ivp

# f(x,y) = x^2 + 5y^2 (elongated bowl, condition number 5)
grad_f = lambda xy: np.array([2*xy[0], 10*xy[1]])
f_val  = lambda xy: xy[0]**2 + 5*xy[1]**2

# Continuous gradient flow
def flow_rhs(t, theta):
    return -grad_f(theta)

theta0 = np.array([2.0, 1.5])
sol = solve_ivp(flow_rhs, [0, 3], theta0, max_step=0.01, dense_output=True)
t_cont = np.linspace(0, 3, 300)
flow_path = sol.sol(t_cont).T

# Discrete gradient descent at two learning rates
def gradient_descent_path(theta0, lr, n_steps):
    path = [theta0.copy()]
    theta = theta0.copy()
    for _ in range(n_steps):
        theta = theta - lr * grad_f(theta)
        path.append(theta.copy())
    return np.array(path)

disc_01 = gradient_descent_path(theta0, lr=0.05, n_steps=60)
disc_05 = gradient_descent_path(theta0, lr=0.15, n_steps=60)

print(f'Gradient flow final loss: {f_val(flow_path[-1]):.2e}')
print(f'GD lr=0.05 final loss:    {f_val(disc_01[-1]):.2e}')
print(f'GD lr=0.15 final loss:    {f_val(disc_05[-1]):.2e}')

# Plots of these trajectories are drawn in the next visualization cell.
print('Gradient flow data computed.')

Code cell 38

# === 9.2 Visualize Gradient Flow vs Discrete GD ===

try:
    import matplotlib.pyplot as plt
    HAS_MPL = True
except ImportError:
    HAS_MPL = False

if HAS_MPL:
    COLORS = {'blue': '#2166AC', 'red': '#D6604D', 'green': '#4DAC26', 'orange': '#F4A582'}
    plt.style.use('seaborn-v0_8-whitegrid')
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    x = np.linspace(-2.5, 2.5, 200)
    y = np.linspace(-2, 2, 200)
    X, Y = np.meshgrid(x, y)
    Z = X**2 + 5*Y**2

    # Trajectory plot
    ax = axes[0]
    ax.contourf(X, Y, Z, levels=15, cmap='viridis', alpha=0.4)
    ax.contour(X, Y, Z, levels=15, colors='white', linewidths=0.5, alpha=0.6)
    ax.plot(flow_path[:, 0], flow_path[:, 1], '-', color=COLORS['blue'],
            lw=2.5, label='Gradient flow (continuous)')
    ax.plot(disc_01[:, 0], disc_01[:, 1], 'o-', color=COLORS['red'],
            ms=3, lw=1.5, label='GD η=0.05')
    ax.plot(disc_05[:, 0], disc_05[:, 1], 's-', color=COLORS['orange'],
            ms=3, lw=1.5, label='GD η=0.15')
    ax.plot(*theta0, 'k*', ms=12, label='Start')
    ax.plot(0, 0, 'k+', ms=15, mew=3, label='Minimum')
    ax.set_xlabel('x'); ax.set_ylabel('y')
    ax.set_title('Trajectory: gradient flow vs discrete GD')
    ax.legend(fontsize=9)

    # Loss vs time/steps
    ax2 = axes[1]
    ax2.semilogy(t_cont, [f_val(p) for p in flow_path], '-', color=COLORS['blue'],
                 lw=2.5, label='Gradient flow')
    steps = np.arange(len(disc_01))
    ax2.semilogy(steps * 0.05, [f_val(p) for p in disc_01], 'o-', color=COLORS['red'],
                 ms=3, lw=1.5, label='GD η=0.05')
    ax2.semilogy(steps * 0.15, [f_val(p) for p in disc_05], 's-', color=COLORS['orange'],
                 ms=3, lw=1.5, label='GD η=0.15')
    ax2.set_xlabel('Time / effective time (lr × steps)')
    ax2.set_ylabel('Loss (log scale)')
    ax2.set_title('Loss convergence')
    ax2.legend(fontsize=9)

    plt.tight_layout()
    plt.savefig('/tmp/gradient_flow.png', dpi=100, bbox_inches='tight')
    plt.show()
    print('Gradient flow plot saved.')
else:
    print('matplotlib not available.')

10. Sparse Gradients and Coordinate-Wise Methods

In large embedding tables (LLMs, recommendation systems), most tokens in a batch do not appear — their gradient rows are exactly zero. This sparsity is exploited by:

  • Sparse optimizers that only update embedding rows with non-zero gradients
  • LoRA: instead of updating the full weight gradient $\nabla W \in \mathbb{R}^{m\times n}$, parameterise the update as $\Delta W = AB$ with $A\in\mathbb{R}^{m\times r}$, $B\in\mathbb{R}^{r\times n}$ and $r \ll \min(m,n)$. The effective gradient lives in a low-rank subspace (see the sketch below).
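
A minimal numerical sketch of that low-rank structure (the toy quadratic loss is an arbitrary illustrative choice, not the LoRA training objective): if the loss is evaluated at $W_0 + AB$, the chain rule gives $\nabla_A \mathcal{L} = G B^\top$ and $\nabla_B \mathcal{L} = A^\top G$, where $G = \nabla_W \mathcal{L}$ at $W_0 + AB$.

import numpy as np
rng = np.random.default_rng(0)

m, n, r = 12, 10, 2
W0 = rng.normal(size=(m, n))
A  = rng.normal(size=(m, r))
B  = rng.normal(size=(r, n))
C  = rng.normal(size=(m, n))                        # target defining a toy quadratic loss

loss_W = lambda W: 0.5 * np.sum((W - C)**2)         # L(W); dL/dW = W - C
G = (W0 + A @ B) - C                                # dL/dW evaluated at W0 + AB

grad_A = G @ B.T                                    # chain rule: dL/dA = G B^T
grad_B = A.T @ G                                    # chain rule: dL/dB = A^T G

# Centered finite-difference check on one entry of A and one entry of B
h = 1e-6
E_A = np.zeros_like(A); E_A[0, 0] = h
E_B = np.zeros_like(B); E_B[1, 3] = h
fd_A = (loss_W(W0 + (A + E_A) @ B) - loss_W(W0 + (A - E_A) @ B)) / (2*h)
fd_B = (loss_W(W0 + A @ (B + E_B)) - loss_W(W0 + A @ (B - E_B))) / (2*h)
print('PASS - dL/dA = G B^T matches finite differences' if np.isclose(grad_A[0, 0], fd_A, atol=1e-6) else 'FAIL')
print('PASS - dL/dB = A^T G matches finite differences' if np.isclose(grad_B[1, 3], fd_B, atol=1e-6) else 'FAIL')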

Coordinate descent updates one coordinate at a time:

$$\theta_i \leftarrow \theta_i - \eta \frac{\partial f}{\partial \theta_i}$$

For separable functions (e.g. $f(\theta) = \sum_i f_i(\theta_i)$) this is exact; for general $f$ it converges under mild conditions. Block coordinate descent underpins many alternating optimisation algorithms.
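
A minimal sketch of cyclic coordinate descent with exactly this update on a small convex quadratic (the matrix, step size, and sweep count are arbitrary illustrative choices):

import numpy as np

# Convex quadratic f(theta) = 0.5 theta^T Q theta - b^T theta; the minimiser solves Q theta = b
Q = np.array([[3.0, 1.0], [1.0, 2.0]])
b = np.array([1.0, -1.0])
grad_i = lambda th, i: Q[i] @ th - b[i]        # partial derivative df/dtheta_i
theta_star = np.linalg.solve(Q, b)

eta = 0.25
theta = np.zeros(2)
for sweep in range(300):
    for i in range(len(theta)):                # cycle through the coordinates
        theta[i] -= eta * grad_i(theta, i)     # theta_i <- theta_i - eta * df/dtheta_i

print('Coordinate descent:', theta.round(6))
print('Exact minimiser:   ', theta_star.round(6))
print('PASS - converged to the minimiser' if np.allclose(theta, theta_star, atol=1e-6) else 'FAIL')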

Code cell 40

# === 10.1 Sparse Gradient: Embedding Table Update ===

import numpy as np
np.random.seed(42)

vocab_size, embed_dim = 50000, 768
batch_size, seq_len = 32, 128

# Simulate which token indices appear in a batch
token_ids = np.random.randint(0, vocab_size, size=(batch_size, seq_len))
unique_tokens = np.unique(token_ids)
sparsity = 1 - len(unique_tokens) / vocab_size
print(f'Vocab size:       {vocab_size:,}')
print(f'Unique tokens in batch: {len(unique_tokens):,}')
print(f'Gradient sparsity: {sparsity:.4f} ({sparsity*100:.1f}% rows are zero)')

# Dense vs sparse gradient update cost
dense_ops = vocab_size * embed_dim
sparse_ops = len(unique_tokens) * embed_dim
print(f'\nDense update: {dense_ops:,} scalar operations')
print(f'Sparse update: {sparse_ops:,} scalar operations')
print(f'Speedup: {dense_ops/sparse_ops:.1f}x')

# LoRA: low-rank gradient subspace
rank = 8
m, n = 4096, 4096
lora_params = rank * (m + n)
full_params = m * n
print(f'\nLoRA r={rank}: {lora_params:,} params vs full {full_params:,}')
print(f'Parameter reduction: {full_params/lora_params:.0f}x')

11. Memory vs Computation Trade-offs in Gradient Computation

Computing $\nabla_\theta \mathcal{L}$ for a deep network requires storing all intermediate activations $z^{(l)}$ during the forward pass so that the backward pass can use them.

For a Transformer with $L$ layers, $N$ tokens, hidden size $d$:

Method                                            | Memory             | Compute
Full storage                                      | $O(LNd)$           | $O(LNd)$
Gradient checkpointing (every $\sqrt{L}$ layers)  | $O(\sqrt{L}\,Nd)$  | $O(\tfrac{3}{2}LNd)$
Rematerialisation (no storage)                    | $O(Nd)$            | $O(L^2 Nd)$

Gradient checkpointing (PyTorch: torch.utils.checkpoint) trades a 33% compute overhead for a $\sqrt{L}$ memory reduction — critical for training long-context models.

Automatic differentiation and its memory model: §05 — Automatic Differentiation
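
A minimal sketch of activation checkpointing, assuming a recent PyTorch is installed (the layer sizes are arbitrary): activations inside the wrapped block are recomputed during the backward pass instead of being stored.

try:
    import torch
    from torch.utils.checkpoint import checkpoint

    lin1, lin2 = torch.nn.Linear(64, 64), torch.nn.Linear(64, 64)

    def block(t):
        # intermediate activations of this block are not kept; they are
        # recomputed from t when backward reaches this segment
        return torch.relu(lin2(torch.relu(lin1(t))))

    x = torch.randn(16, 64, requires_grad=True)
    out = checkpoint(block, x, use_reentrant=False)
    out.sum().backward()
    print('Checkpointed backward ran; x.grad shape:', tuple(x.grad.shape))
except ImportError:
    print('torch not available - skipping checkpointing sketch.')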

Code cell 42

# === 11.1 Gradient Statistics Across Training ===

import numpy as np
np.random.seed(42)

# Simulate gradient norms across training steps
# Typical pattern: high early, decreasing, occasional spikes
n_steps = 500
t = np.arange(n_steps)

# Base decaying norm + noise + occasional spikes
base = 2.0 * np.exp(-t / 200) + 0.1
noise = 0.05 * np.random.randn(n_steps)
spike_locs = np.random.choice(n_steps, size=15, replace=False)
spikes = np.zeros(n_steps)
spikes[spike_locs] = np.random.exponential(1.5, size=15)
grad_norms = np.abs(base + noise + spikes)

clip_threshold = 1.0
clipped_norms = np.minimum(grad_norms, clip_threshold)

print(f'Max gradient norm:     {grad_norms.max():.3f}')
print(f'Mean gradient norm:    {grad_norms.mean():.3f}')
print(f'Fraction clipped:      {(grad_norms > clip_threshold).mean():.3f}')
print(f'Mean after clipping:   {clipped_norms.mean():.3f}')

try:
    import matplotlib.pyplot as plt
    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(t, grad_norms, alpha=0.7, color='#D6604D', lw=1, label='Raw norm')
    ax.plot(t, clipped_norms, alpha=0.9, color='#2166AC', lw=1.5, label=f'Clipped (max={clip_threshold})')
    ax.axhline(clip_threshold, color='#4DAC26', ls='--', lw=1.5, label='Clip threshold')
    ax.set_xlabel('Training step'); ax.set_ylabel('‖∇‖₂')
    ax.set_title('Gradient norm during training: raw vs clipped')
    ax.legend()
    plt.tight_layout()
    plt.savefig('/tmp/grad_stats.png', dpi=100, bbox_inches='tight')
    plt.show()
    print('Gradient statistics plot saved.')
except ImportError:
    print('matplotlib not available.')

12. Wirtinger Calculus for Complex-Valued Functions

Modern neural networks use complex-valued operations in signal processing, MRI reconstruction, and some attention variants. For $f : \mathbb{C}^n \to \mathbb{R}$ (a real-valued function of complex variables), the standard derivative $df/dz$ does not exist unless $f$ is holomorphic — but real-valued losses typically are not.

Wirtinger derivatives split complex differentiation into:

$$\frac{\partial f}{\partial z} = \frac{1}{2}\left(\frac{\partial f}{\partial x} - i\frac{\partial f}{\partial y}\right), \qquad \frac{\partial f}{\partial \bar{z}} = \frac{1}{2}\left(\frac{\partial f}{\partial x} + i\frac{\partial f}{\partial y}\right)$$

where $z = x + iy$. For real-valued $f$, the gradient direction that decreases $f$ most steeply is $-\partial f / \partial \bar{z}$ — the conjugate Wirtinger gradient. PyTorch uses this convention for complex tensors.

Code cell 44

# === 12.1 Wirtinger Gradient Demo ===

import numpy as np

# f(z) = |z - z0|^2 = (x-x0)^2 + (y-y0)^2
# Wirtinger: df/dz_bar = (x-x0) + i(y-y0) = z - z0
z0 = 1.0 + 2.0j
z  = 3.0 - 1.0j

f = lambda z: abs(z - z0)**2

# Numerical Wirtinger gradient: conjugate Wirtinger ∂f/∂z*
h = 1e-7
# ∂f/∂x ≈ (f(z+h) - f(z-h)) / (2h)
# ∂f/∂y ≈ (f(z+ih) - f(z-ih)) / (2h)
dfdx = (f(z + h) - f(z - h)) / (2*h)
dfdy = (f(z + 1j*h) - f(z - 1j*h)) / (2*h)

wirtinger_conj = 0.5 * (dfdx + 1j * dfdy)   # ∂f/∂z̄
exact_conj = z - z0                           # known closed form

print(f'Wirtinger ∂f/∂z̄ (numerical): {wirtinger_conj:.6f}')
print(f'Exact (z - z0):               {exact_conj:.6f}')
ok = abs(wirtinger_conj - exact_conj) < 1e-5
print(f'PASS: {ok} — numerical matches exact')

# Gradient step: move z in direction -(df/dz_bar)
lr = 0.5
z_new = z - lr * wirtinger_conj
print(f'\nz       = {z:.4f}, f(z)     = {f(z):.4f}')
print(f'z_new   = {z_new:.4f}, f(z_new) = {f(z_new):.4f}')
print(f'Minimum is at z0 = {z0:.4f}')

13. Summary and Connections

Key results from this notebook

Result                 | Formula                                                                                   | Where Used
Gradient definition    | $\nabla f(\mathbf{x}) = [\partial f/\partial x_1, \ldots, \partial f/\partial x_n]^\top$ | Every optimizer
Steepest ascent        | $\mathbf{u}^* = \nabla f / \|\nabla f\|$                                                  | Gradient descent direction
Directional derivative | $D_{\mathbf{u}}f = \nabla f \cdot \mathbf{u}$                                             | Sensitivity analysis
MSE gradient           | $\nabla_\mathbf{w}\mathcal{L} = \frac{2}{m}X^\top(X\mathbf{w}-\mathbf{y})$                | Linear regression
CE gradient            | $\nabla_\mathbf{w}\mathcal{L} = \frac{1}{m}X^\top(\mathbf{p}-\mathbf{y})$                 | Logistic regression, LLMs
Gradient checking      | relative error $< 10^{-5}$                                                                | Debugging backprop
Gradient clipping      | $\hat{g} = g \cdot \min(1, G/\|g\|)$                                                      | Stable training

What comes next


14. Convergence Theory for Gradient Descent

For an $L$-smooth convex function ($\|\nabla f(x) - \nabla f(y)\| \le L\|x-y\|$), gradient descent with step size $\eta \le 1/L$ satisfies:

$$f(\theta_t) - f^* \le \frac{\|\theta_0 - \theta^*\|^2}{2\eta t}$$

This $O(1/t)$ rate is tight for convex functions. For $\mu$-strongly convex functions:

$$f(\theta_t) - f^* \le \left(1 - \frac{\mu}{L}\right)^t \left[f(\theta_0) - f^*\right]$$

a geometric (linear) rate. The ratio $\kappa = L/\mu$ is the condition number — elongated loss landscapes (large $\kappa$) slow convergence dramatically. Preconditioning (Adam, Adagrad, natural gradient) aims to reduce the effective $\kappa$.
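
A quick empirical check of the $O(1/t)$ bound, a minimal sketch on a smooth convex quadratic (the matrix, starting point, and step count are arbitrary illustrative choices):

import numpy as np

# Smooth convex quadratic: f(theta) = 0.5 theta^T Q theta, so theta* = 0 and f* = 0
Q = np.diag([0.05, 1.0, 4.0])
L_smooth = 4.0                                 # largest eigenvalue = smoothness constant L
eta = 1.0 / L_smooth

f    = lambda th: 0.5 * th @ Q @ th
grad = lambda th: Q @ th

theta0 = np.array([3.0, -2.0, 1.0])
theta  = theta0.copy()
violations = 0
for t in range(1, 501):
    theta = theta - eta * grad(theta)
    bound = np.sum(theta0**2) / (2 * eta * t)  # ||theta_0 - theta*||^2 / (2 eta t)
    violations += f(theta) > bound
print(f'O(1/t) bound violated at {violations} of 500 steps (expected 0)')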

Code cell 47

# === 14.1 Gradient Descent Convergence Rates ===

import numpy as np
np.random.seed(42)

# Quadratic f(x) = (1/2) x^T A x with A = diag(mu, L)
# Condition numbers to compare
conditions = [1, 5, 20, 100]
n_steps = 200
mu = 1.0

def gd_convergence(kappa, n_steps):
    L = kappa * mu
    eta = 1.0 / L  # optimal step size
    # Initial loss = 1.0 (normalised)
    # Linear convergence: loss_t = (1 - mu/L)^t
    t = np.arange(n_steps + 1)
    rate = 1 - mu / L
    return t, rate ** t

try:
    import matplotlib.pyplot as plt
    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(10, 5))
    cmap = plt.cm.plasma
    colors = [cmap(i/len(conditions)) for i in range(len(conditions))]

    for kappa, color in zip(conditions, colors):
        t, losses = gd_convergence(kappa, n_steps)
        ax.semilogy(t, losses, lw=2, color=color, label=f'κ={kappa}, η=1/L')
        # Mark steps to reach 1e-4 precision
        idx = np.searchsorted(-losses, -1e-4)
        if idx < n_steps:
            ax.axvline(idx, color=color, lw=0.8, ls=':', alpha=0.7)

    ax.set_xlabel('Gradient descent step')
    ax.set_ylabel('$f(\\theta_t) - f^*$ (log scale)')
    ax.set_title('Convergence rate vs condition number κ = L/μ')
    ax.legend()
    ax.set_xlim(0, n_steps)
    plt.tight_layout()
    plt.savefig('/tmp/gd_convergence.png', dpi=100, bbox_inches='tight')
    plt.show()
    print('Convergence plot saved.')
except ImportError:
    print('matplotlib not available.')

# Analytical steps to 1e-4 precision
print('\nSteps to reach 1e-4 precision:')
for kappa in conditions:
    L = kappa * mu
    rate = 1 - mu / L
    steps = int(np.ceil(np.log(1e-4) / np.log(rate)))
    print(f'  κ={kappa:3d}: {steps:5d} steps  (rate={rate:.4f})')

Code cell 48

# === 14.2 Adaptive vs Standard Gradient Descent ===

import numpy as np
np.random.seed(42)

# Ill-conditioned quadratic: f(x,y) = 0.5*(x^2/0.01 + y^2) → condition number 100
def f_ill(xy): return 0.5 * (xy[0]**2 / 0.01 + xy[1]**2)
def grad_ill(xy): return np.array([xy[0] / 0.01, xy[1]])

theta0 = np.array([0.5, 1.0])
n_steps = 150

# Standard GD with η = 0.005 (safe for L=100)
def run_gd(theta0, lr, n_steps):
    theta = theta0.copy()
    path = [theta.copy()]
    for _ in range(n_steps):
        theta -= lr * grad_ill(theta)
        path.append(theta.copy())
    return np.array(path)

# Adam (diagonal preconditioning approximates the Hessian inverse)
def run_adam(theta0, lr, n_steps, b1=0.9, b2=0.999, eps=1e-8):
    theta = theta0.copy()
    m = np.zeros_like(theta)
    v = np.zeros_like(theta)
    path = [theta.copy()]
    for t in range(1, n_steps + 1):
        g = grad_ill(theta)
        m = b1 * m + (1 - b1) * g
        v = b2 * v + (1 - b2) * g**2
        m_hat = m / (1 - b1**t)
        v_hat = v / (1 - b2**t)
        theta -= lr * m_hat / (np.sqrt(v_hat) + eps)
        path.append(theta.copy())
    return np.array(path)

gd_path  = run_gd(theta0, lr=0.005, n_steps=n_steps)
adam_path = run_adam(theta0, lr=0.1, n_steps=n_steps)

gd_losses   = [f_ill(p) for p in gd_path]
adam_losses = [f_ill(p) for p in adam_path]

print(f'GD  final loss: {gd_losses[-1]:.6f}')
print(f'Adam final loss: {adam_losses[-1]:.6f}')
print(f'Adam speedup factor: ~{gd_losses[50]/adam_losses[50]:.0f}x at step 50')

try:
    import matplotlib.pyplot as plt
    plt.style.use('seaborn-v0_8-whitegrid')
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    x = np.linspace(-0.6, 0.6, 200)
    y = np.linspace(-1.2, 1.2, 200)
    X, Y = np.meshgrid(x, y)
    Z = 0.5 * (X**2/0.01 + Y**2)
    ax = axes[0]
    ax.contourf(X, Y, Z, levels=20, cmap='viridis', alpha=0.5)
    ax.plot(gd_path[:,0], gd_path[:,1], 'o-', ms=2, lw=1.5, color='#D6604D', label='GD η=0.005')
    ax.plot(adam_path[:,0], adam_path[:,1], 's-', ms=2, lw=1.5, color='#2166AC', label='Adam lr=0.1')
    ax.set_xlabel('x'); ax.set_ylabel('y')
    ax.set_title('Trajectory (κ=100)')
    ax.legend(fontsize=9)
    axes[1].semilogy(gd_losses, color='#D6604D', lw=2, label='GD')
    axes[1].semilogy(adam_losses, color='#2166AC', lw=2, label='Adam')
    axes[1].set_xlabel('Step'); axes[1].set_ylabel('Loss (log)')
    axes[1].set_title('Loss convergence')
    axes[1].legend()
    plt.tight_layout()
    plt.savefig('/tmp/adam_vs_gd.png', dpi=100, bbox_inches='tight')
    plt.show()
    print('Adam vs GD plot saved.')
except ImportError:
    print('matplotlib not available.')

15. Stochastic Gradients and Mini-Batch Estimation

In practice, the true gradient $\nabla_\theta \mathcal{L}(\theta) = \frac{1}{N}\sum_{i=1}^N \nabla_\theta \ell_i(\theta)$ is computed over the full dataset — prohibitively expensive for $N = 10^{12}$ tokens.

Stochastic gradient descent (SGD) uses a mini-batch estimate:

$$\hat{g} = \frac{1}{|\mathcal{B}|}\sum_{i\in\mathcal{B}} \nabla_\theta \ell_i(\theta)$$

Statistical properties:

  • $\mathbb{E}[\hat{g}] = \nabla_\theta \mathcal{L}$ — the mini-batch gradient is unbiased
  • $\mathrm{Var}[\hat{g}] = \sigma^2 / |\mathcal{B}|$ — variance decreases as $1/|\mathcal{B}|$; larger batches give more accurate gradient estimates
  • Gradient noise acts as implicit regularisation, helping escape sharp minima

The signal-to-noise ratio $\mathrm{SNR} = \|\mathbb{E}[\hat{g}]\|^2 / \mathrm{Var}[\hat{g}]$ guides batch size selection — when SNR $\ll 1$, increasing batch size helps; when SNR $\gg 1$, larger batches add diminishing returns.
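
A rough illustration of the SNR criterion, sketched on the same kind of one-parameter least-squares gradient used in the next cell (the data size, noise level, and batch sizes are arbitrary choices):

import numpy as np
rng = np.random.default_rng(0)

N = 20000
x = rng.normal(size=N)
y = 2.0 * x + 0.5 * rng.normal(size=N)
w = 1.0                                        # current parameter value

per_example = 2.0 * x * (w * x - y)            # per-example gradient of the squared error
full_grad = per_example.mean()                 # E[g_hat] for any batch size (unbiased)
var1 = per_example.var()                       # single-example gradient variance

print('Batch size |  SNR = grad^2 / (var / B)')
for B in [1, 8, 64, 512, 4096]:
    snr = full_grad**2 / (var1 / B)
    note = '  <- noise-dominated' if snr < 1 else ''
    print(f'{B:>10} | {snr:8.2f}{note}')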

Code cell 50

# === 15.1 Mini-Batch Gradient Variance vs Batch Size ===

import numpy as np
np.random.seed(42)

# Dataset: N samples from a linear model y = 2x + noise
N = 10000
x_data = np.random.randn(N)
y_data = 2.0 * x_data + 0.5 * np.random.randn(N)

# True gradient of MSE w.r.t. w: (2/N) sum_i x_i*(x_i*w - y_i)
w = 1.0  # current weight estimate
true_grad = (2.0 / N) * np.sum(x_data * (w * x_data - y_data))

# Mini-batch gradient variance vs batch size
batch_sizes = [1, 8, 32, 128, 512, 2048]
n_trials = 500

print(f'True gradient: {true_grad:.6f}')
print(f'\nBatch size | Mean estimate | Std dev   | Variance reduction')
print('-' * 65)

grad_stds = []
for B in batch_sizes:
    estimates = []
    for _ in range(n_trials):
        idx = np.random.choice(N, size=B, replace=False)
        xb, yb = x_data[idx], y_data[idx]
        g = (2.0 / B) * np.sum(xb * (w * xb - yb))
        estimates.append(g)
    mean_g = np.mean(estimates)
    std_g  = np.std(estimates)
    grad_stds.append(std_g)
    print(f'B={B:5d}     | {mean_g:+.6f}    | {std_g:.6f} | {(grad_stds[0]/std_g)**2:.1f}x')

# Verify variance scales as 1/B
ratio = grad_stds[0] / grad_stds[-1]
expected_ratio = np.sqrt(batch_sizes[-1] / batch_sizes[0])
ok = abs(ratio - expected_ratio) / expected_ratio < 0.15
print(f'\nPASS: {ok} — variance scales as 1/B (std ratio={ratio:.2f}, expected≈{expected_ratio:.2f})')

16. Notebook Summary

This notebook covered the complete theory and computation of partial derivatives and gradients, from first principles to modern ML applications:

Topics covered

Section                   | Key result                                            | Code demo
Scalar fields             | Level sets, topology of loss landscapes               | 3D surface plots
Partial derivatives       | Limit definition, finite difference convergence       | Error vs h log-log
Gradient                  | Standard forms, steepest ascent proof                 | Quiver fields
Directional derivatives   | Maximised by ∇f direction                             | Polar angle sweep
ML gradients              | MSE and CE gradient derivations                       | Logistic regression training
Gradient descent          | Trajectory visualisation, multiple learning rates     | 2D landscape
Numerical differentiation | Forward vs centred differences, optimal h             | Error curves
Gradient checking         | Relative error < 1e-5 for correct gradients           | 5 test functions
Loss landscape            | Beale function, multiple basins, GD trajectories      | Non-convex plots
Gradient clipping         | Norm-based clipping demo                              | Training stability
Natural gradient          | FIM-preconditioned update for Gaussian MLE            | Trajectory comparison
Softmax Jacobian          | Diagonal minus outer product formula                  | CE gradient verification
Hessian (preview)         | Eigenstructure, curvature directions                  | Eigenvector overlay
Gradient flow             | Continuous-time ODE vs discrete GD                    | Trajectory + loss curves
Sparse gradients          | Embedding table update, LoRA parameterisation         | Cost comparison
Wirtinger calculus        | Complex gradient for real-valued losses               | Numerical Wirtinger check
Convergence theory        | O(1/t) convex, linear strongly convex                 | κ vs steps plot
Adam vs GD                | Adaptive preconditioning on ill-conditioned quadratic | Trajectory plot
SGD variance              | 1/B scaling of mini-batch gradient noise              | Batch size sweep

Next steps