Theory Notebook: Math for LLMs

Derivatives and Differentiation

Converted from theory.ipynb for web reading.

"The derivative is the instantaneous rate of change — and backpropagation is just the chain rule applied to a computation graph."

Interactive theory notebook covering: derivative definition, differentiation rules, chain rule, activation function derivatives, numerical differentiation, and optimization foundations.

Companion: notes.md | exercises.ipynb

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

import numpy as np
import numpy.linalg as la
from scipy import integrate, special, stats
from math import factorial
import matplotlib.patches as patches

COLORS = {
    "primary": "#0077BB",
    "secondary": "#EE7733",
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",
    "highlight": "#EE3377",
}
HAS_MPL = True
np.set_printoptions(precision=8, suppress=True)
np.random.seed(42)

def header(title):
    print("\n" + "=" * len(title))
    print(title)
    print("=" * len(title))

def check_true(name, cond):
    ok = bool(cond)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    return ok

def check_close(name, got, expected, tol=1e-8):
    ok = np.allclose(got, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}: got {got}, expected {expected}")
    return ok



def centered_diff(f, x, h=1e-6):
    return (f(x + h) - f(x - h)) / (2 * h)

def forward_diff(f, x, h=1e-6):
    return (f(x + h) - f(x)) / h

def backward_diff(f, x, h=1e-6):
    return (f(x) - f(x - h)) / h



def grad_check(f, x, analytic_grad, h=1e-6):
    x = np.asarray(x, dtype=float)
    analytic_grad = np.asarray(analytic_grad, dtype=float)
    numeric_grad = np.zeros_like(x, dtype=float)
    for idx in np.ndindex(x.shape):
        x_plus = x.copy(); x_minus = x.copy()
        x_plus[idx] += h; x_minus[idx] -= h
        numeric_grad[idx] = (f(x_plus) - f(x_minus)) / (2 * h)
    denom = la.norm(analytic_grad) + la.norm(numeric_grad) + 1e-12
    return la.norm(analytic_grad - numeric_grad) / denom



def check(name, got, expected, tol=1e-8):
    return check_close(name, got, expected, tol=tol)

print("Chapter helper setup complete.")

1. Intuition — The Derivative as Instantaneous Rate of Change

The derivative measures how fast a function changes at a point. We visualize the secant-to-tangent limit and explore what the derivative means geometrically.

Code cell 5

# === 1.1 Secant Lines Converging to Tangent ===

def f(x):
    return x**2

def f_prime(x):
    return 2*x

a = 1.0  # Point of tangency
h_values = [1.0, 0.5, 0.2, 0.05]

x = np.linspace(-0.5, 2.5, 300)

if HAS_MPL:
    colors = plt.cm.plasma(np.linspace(0.2, 0.9, len(h_values)))
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    ax = axes[0]
    ax.plot(x, f(x), 'k-', lw=2, label='$f(x) = x^2$')
    # Secant lines
    for i, h in enumerate(h_values):
        slope = (f(a+h) - f(a)) / h
        x_line = np.linspace(a-0.3, a+h+0.3, 50)
        y_line = f(a) + slope*(x_line - a)
        ax.plot(x_line, y_line, '--', color=colors[i], lw=1.5,
                label=f'h={h:.2f}, slope={slope:.2f}')
    # Tangent
    x_tang = np.linspace(a-0.8, a+0.8, 50)
    ax.plot(x_tang, f(a) + f_prime(a)*(x_tang - a), 'r-', lw=2.5,
            label=f"Tangent: slope={f_prime(a):.1f}")
    ax.plot(a, f(a), 'ro', ms=8)
    ax.set_xlim(-0.2, 2.2); ax.set_ylim(-0.2, 4.5)
    ax.set_xlabel('x'); ax.set_ylabel('f(x)')
    ax.set_title('Secant Lines → Tangent Line')
    ax.legend(fontsize=9)

    # Right panel: slope vs h
    ax2 = axes[1]
    h_fine = np.logspace(-4, 0, 200)
    slopes = [(f(a+h) - f(a)) / h for h in h_fine]
    ax2.semilogx(h_fine, slopes, 'b-', lw=2, label='Secant slope $(f(1+h)-f(1))/h$')
    ax2.axhline(f_prime(a), color='r', ls='--', lw=2, label=f"True $f'(1)$ = {f_prime(a)}")
    ax2.set_xlabel('Step size h'); ax2.set_ylabel('Slope')
    ax2.set_title('Convergence of Secant to Tangent')
    ax2.legend()

    plt.tight_layout()
    plt.savefig('/tmp/secant_tangent.png', dpi=100, bbox_inches='tight')
    plt.show()
    print('Figure saved.')

# Verify derivative
h = 1e-7
numerical = (f(1+h) - f(1)) / h
analytic = f_prime(1)
print(f'Numerical f\'(1): {numerical:.8f}')
print(f'Analytic  f\'(1): {analytic:.8f}')
print(f'PASS - error < 1e-6: {abs(numerical - analytic) < 1e-6}')

2. Formal Definition — Derivative as a Limit

$f'(a) = \lim_{h \to 0} \frac{f(a+h) - f(a)}{h}$

We compute this limit explicitly for several functions and verify the results numerically.
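For $f(x)=x^2$ the limit can be evaluated algebraically, which is exactly what the cell below approximates with finite $h$:

$$f'(x) = \lim_{h\to 0}\frac{(x+h)^2 - x^2}{h} = \lim_{h\to 0}\frac{2xh + h^2}{h} = \lim_{h\to 0}(2x + h) = 2x.$$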

Code cell 7

# === 2.1 Derivative from Definition — Worked Examples ===

# Example 1: f(x) = x^2, f'(x) = 2x
def limit_x2(x, h):
    return ((x+h)**2 - x**2) / h

# Example 2: f(x) = sqrt(x), f'(x) = 1/(2*sqrt(x))
def limit_sqrt(x, h):
    return (np.sqrt(x+h) - np.sqrt(x)) / h

# Example 3: f(x) = e^x, f'(x) = e^x
def limit_exp(x, h):
    return (np.exp(x+h) - np.exp(x)) / h

x0 = 2.0
h_vals = [0.1, 0.01, 0.001, 1e-6]

print('=== Convergence of Difference Quotients ===')
print(f'\nf(x) = x^2 at x={x0}  (true f\'={2*x0})')
for h in h_vals:
    print(f'  h={h:.0e}: {limit_x2(x0, h):.8f}')

print(f'\nf(x) = sqrt(x) at x={x0}  (true f\'={1/(2*np.sqrt(x0)):.8f})')
for h in h_vals:
    print(f'  h={h:.0e}: {limit_sqrt(x0, h):.8f}')

print(f'\nf(x) = e^x at x={x0}  (true f\'={np.exp(x0):.8f})')
for h in h_vals:
    print(f'  h={h:.0e}: {limit_exp(x0, h):.8f}')

Code cell 8

# === 2.2 Differentiable vs Continuous — ReLU and |x| ===

x = np.linspace(-2, 2, 400)

relu = np.maximum(0, x)
abs_x = np.abs(x)

# Difference quotients at x=0
h_vals = [0.5, 0.1, 0.01, 0.001]
print('=== Left and Right Derivatives at x=0 ===')
print('\nReLU:')
for h in h_vals:
    left = (np.maximum(0, -h) - 0) / (-h)
    right = (np.maximum(0, h) - 0) / h
    print(f'  h={h:.3f}: left={left:.3f}, right={right:.3f}')
print('  → right limit = 1, left limit = 0 → NOT differentiable (subgradient ∈ [0,1])')

print('\n|x|:')
for h in h_vals:
    left = (abs(-h) - 0) / (-h)
    right = (abs(h) - 0) / h
    print(f'  h={h:.3f}: left={left:.3f}, right={right:.3f}')
print('  → right limit = 1, left limit = -1 → NOT differentiable (corner)')

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    for ax, func, name in [(axes[0], relu, 'ReLU(x)'), (axes[1], abs_x, '|x|')]:
        ax.plot(x, func, 'b-', lw=2, label=name)
        ax.axvline(0, color='r', ls='--', alpha=0.5, label='x=0 (non-diff.)')
        ax.set_xlabel('x'); ax.set_ylabel('f(x)')
        ax.set_title(f'{name} — continuous but not differentiable at 0')
        ax.legend()
    plt.tight_layout(); plt.show()

3. Basic Differentiation Rules

The power, product, quotient, and sum rules let us differentiate any algebraic expression without computing limits. We verify each rule numerically.

Code cell 10

# === 3.1-3.5 Differentiation Rules — Numerical Verification ===

def centered_diff(f, x, h=1e-5):
    return (f(x+h) - f(x-h)) / (2*h)

def check(name, analytic_fn, numerical_fn, x):
    a = analytic_fn(x)
    n = numerical_fn(x)
    ok = abs(a - n) < 1e-6
    print(f"{'PASS' if ok else 'FAIL'} {name}: analytic={a:.8f}, numerical={n:.8f}")

x = 1.5
print('=== Derivative Rules — Verification at x=1.5 ===')

# Power rule: (x^5)' = 5x^4
check('Power rule x^5', lambda x: 5*x**4, lambda x: centered_diff(lambda t: t**5, x), x)

# Product rule: (x^2 * sin(x))' = 2x*sin(x) + x^2*cos(x)
check('Product rule x^2*sin(x)', 
      lambda x: 2*x*np.sin(x) + x**2*np.cos(x),
      lambda x: centered_diff(lambda t: t**2*np.sin(t), x), x)

# Quotient rule: (x^2/(x+1))' = (2x(x+1) - x^2)/(x+1)^2 = (x^2+2x)/(x+1)^2
check('Quotient rule x^2/(x+1)',
      lambda x: (x**2 + 2*x) / (x+1)**2,
      lambda x: centered_diff(lambda t: t**2/(t+1), x), x)

# Chain rule: sin(x^3)' = cos(x^3)*3x^2
check('Chain rule sin(x^3)',
      lambda x: np.cos(x**3)*3*x**2,
      lambda x: centered_diff(lambda t: np.sin(t**3), x), x)

# Logarithm: (ln(x))' = 1/x
check('Logarithm (ln x)',
      lambda x: 1/x,
      lambda x: centered_diff(np.log, x), x)

# Composite: e^(-x^2/2) - Gaussian kernel
check('Gaussian e^(-x^2/2)',
      lambda x: -x*np.exp(-x**2/2),
      lambda x: centered_diff(lambda t: np.exp(-t**2/2), x), x)

4. The Chain Rule

$(f \circ g)'(x) = f'(g(x)) \cdot g'(x)$

The chain rule is the mathematical foundation of backpropagation. Every node in a computation graph applies this rule to propagate gradients.
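Unrolled for a three-node chain $L = f_3(f_2(f_1(x)))$, the rule each node applies is: multiply the upstream gradient by its own local derivative,

$$\frac{dL}{dx} = f_3'\big(f_2(f_1(x))\big)\cdot f_2'\big(f_1(x)\big)\cdot f_1'(x).$$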

Code cell 12

# === 4.1-4.3 Chain Rule — Composition Analysis ===

def centered_diff(f, x, h=1e-6):
    return (f(x+h) - f(x-h)) / (2*h)

x_vals = np.linspace(-2, 2, 200)
x = 1.0

# Example 1: y = e^(-x^2/2) — Gaussian
f1 = lambda t: np.exp(-t**2/2)
f1_prime_analytic = lambda t: -t * np.exp(-t**2/2)
err1 = abs(centered_diff(f1, x) - f1_prime_analytic(x))
print(f'Gaussian derivative error: {err1:.2e}')

# Example 2: y = sin(ln(x)) for x>0
f2 = lambda t: np.sin(np.log(t))
f2_prime_analytic = lambda t: np.cos(np.log(t)) / t
err2 = abs(centered_diff(f2, 2.0) - f2_prime_analytic(2.0))
print(f'sin(ln(x)) derivative error: {err2:.2e}')

# Example 3: y = (1+x^2)^10
f3 = lambda t: (1+t**2)**10
f3_prime_analytic = lambda t: 20*t*(1+t**2)**9
err3 = abs(centered_diff(f3, x) - f3_prime_analytic(x))
print(f'(1+x^2)^10 derivative error: {err3:.2e}')

print('\nAll chain rule verifications PASS' if max(err1,err2,err3) < 1e-5 else 'CHECK FAILED')

# Scalar backprop demo
print('\n=== Scalar Backpropagation Demo ===')
print('Network: L = (sigma(w*x+b) - y)^2 / 2')
w, b, x_in, y = 2.0, -1.0, 1.5, 0.8

# Forward pass
z = w*x_in + b
sigma_z = 1 / (1 + np.exp(-z))
L = 0.5 * (sigma_z - y)**2

# Backward pass
dL_da = sigma_z - y
da_dz = sigma_z * (1 - sigma_z)
dL_dz = dL_da * da_dz
dL_dw = dL_dz * x_in
dL_db = dL_dz * 1

# Numerical check
num_dw = (0.5*(1/(1+np.exp(-((w+1e-5)*x_in+b)))-y)**2 - 0.5*(1/(1+np.exp(-((w-1e-5)*x_in+b)))-y)**2) / 2e-5
print(f'dL/dw — analytic: {dL_dw:.8f}, numerical: {num_dw:.8f}')
print(f'PASS: {abs(dL_dw - num_dw) < 1e-5}')

5. Implicit Differentiation and Linear Approximation

When $y$ is defined implicitly by $F(x,y)=0$, differentiate both sides w.r.t. $x$, treating $y=y(x)$. Linear approximation $f(x)\approx f(a)+f'(a)(x-a)$ is the derivative's most direct application.
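Differentiating $F(x, y(x)) = 0$ with the chain rule gives the general formula applied to the circle in the cell below:

$$F_x + F_y\,\frac{dy}{dx} = 0 \quad\Longrightarrow\quad \frac{dy}{dx} = -\frac{F_x}{F_y};$$

for $F = x^2 + y^2 - r^2$ this is $dy/dx = -2x/(2y) = -x/y$.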

Code cell 14

# === 5.1-5.3 Implicit Differentiation and Linear Approximation ===

import numpy as np

# --- Implicit: circle x^2 + y^2 = r^2 ---
# dy/dx = -x/y
r = 3.0
theta = np.linspace(0, 2*np.pi, 400)
x_circ = r * np.cos(theta)
y_circ = r * np.sin(theta)

# At point (x0, y0) on upper semicircle
x0 = 2.0
y0 = np.sqrt(r**2 - x0**2)
slope_implicit = -x0 / y0

# Verify numerically: y = sqrt(r^2 - x^2), dy/dx = -x/sqrt(r^2-x^2)
slope_numerical = (-x0 / np.sqrt(r**2 - x0**2))
print(f'Circle implicit slope at ({x0:.1f}, {y0:.3f}): {slope_implicit:.6f}')
print(f'Numerical check: {slope_numerical:.6f}')
print(f'PASS: {abs(slope_implicit - slope_numerical) < 1e-10}')

# --- Linear approximation demo ---
print('\n=== Linear Approximation: sin(x) near x=pi/4 ===')
a = np.pi / 4
f_a = np.sin(a)
f_prime_a = np.cos(a)

x_test_vals = [a + dx for dx in [-0.3, -0.1, 0.1, 0.3]]
print(f'Linearization: f(x) ≈ {f_a:.4f} + {f_prime_a:.4f}*(x - {a:.4f})')
print(f'{"x":>8} {"True f(x)":>12} {"Linear approx":>14} {"Error":>10}')
for x in x_test_vals:
    true_val = np.sin(x)
    linear_val = f_a + f_prime_a*(x - a)
    error = abs(true_val - linear_val)
    print(f'{x:8.4f} {true_val:12.6f} {linear_val:14.6f} {error:10.2e}')

# Cross-entropy gradient via related rates
print('\n=== Cross-Entropy Gradient via Chain Rule ===')
z = 1.5
sigma_z = 1/(1+np.exp(-z))
# L = -log(sigma(z)), dL/dz = sigma(z) - 1
dL_dz_analytic = sigma_z - 1

h = 1e-6
L_plus = -np.log(1/(1+np.exp(-(z+h))))
L_minus = -np.log(1/(1+np.exp(-(z-h))))
dL_dz_numerical = (L_plus - L_minus) / (2*h)
print(f'dL/dz = sigma(z)-1: analytic={dL_dz_analytic:.8f}, numerical={dL_dz_numerical:.8f}')
print(f'PASS: {abs(dL_dz_analytic - dL_dz_numerical) < 1e-7}')

6. Higher-Order Derivatives and Concavity

$f''(x) > 0$ means concave up; $f''(x) < 0$ means concave down. The second derivative determines the local curvature of the loss surface.

Code cell 16

# === 6.1-6.4 Higher-Order Derivatives ===

try:
    import matplotlib.pyplot as plt
    HAS_MPL = True
except ImportError:
    HAS_MPL = False

x = np.linspace(-3, 3, 500)

# f = x^4 - 4x^2 (W-shape — two minima)
f_vals = x**4 - 4*x**2
fp_vals = 4*x**3 - 8*x
fpp_vals = 12*x**2 - 8

# Critical points: f' = 0 => 4x^3 - 8x = 4x(x^2-2) = 0 => x in {0, ±sqrt(2)}
crit = [0, np.sqrt(2), -np.sqrt(2)]
print('=== Critical Points of f(x) = x^4 - 4x^2 ===')
for c in crit:
    fpc = 4*c**3 - 8*c
    fppc = 12*c**2 - 8
    kind = 'local min' if fppc > 0 else ('local max' if fppc < 0 else 'inconclusive')
    print(f"  x={c:+.4f}: f={c**4-4*c**2:.4f}, f_prime={fpc:.4f}, f_double_prime={fppc:.4f} -> {kind}")

# Inflection points: f'' = 0 => 12x^2 = 8 => x = ±sqrt(2/3)
infl = np.sqrt(2/3)
print(f'\nInflection points at x = ±{infl:.4f}')

if HAS_MPL:
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    for ax, vals, name, color in zip(axes,
        [f_vals, fp_vals, fpp_vals],
        ['f(x) = x⁴ - 4x²', "f'(x)", "f''(x)"],
        ['blue', 'green', 'red']):
        ax.plot(x, vals, color=color, lw=2, label=name)
        ax.axhline(0, color='k', lw=0.8)
        ax.set_xlim(-3, 3); ax.set_ylim(-5, 10)
        ax.set_xlabel('x'); ax.set_title(name)
        ax.legend()
    plt.suptitle('Function, First, and Second Derivative', fontsize=13)
    plt.tight_layout(); plt.show()

7. Extrema, Critical Points, and the Mean Value Theorem

The MVT guarantees a point where the instantaneous rate equals the average rate. This underlies Lipschitz bounds and gradient descent convergence theory.
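The Lipschitz connection is one line: if $|f'(c)| \le L$ for every $c$ in the interval, then for any $x, y$ the MVT gives $|f(x)-f(y)| = |f'(c)|\,|x-y| \le L\,|x-y|$.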

Code cell 18

# === 7.1-7.4 Extrema and MVT ===

# --- Global extrema on closed interval ---
def f(x):
    return x**3 - 3*x

def fp(x):
    return 3*x**2 - 3

a, b = -2.0, 2.5

# Critical points in (a,b): f'(x) = 0 => x = ±1
crit = [-1.0, 1.0]
candidates = [a] + [c for c in crit if a < c < b] + [b]

print('=== Global Extrema of f(x) = x^3 - 3x on [-2, 2.5] ===')
print(f'{"Point":>8} {"f(x)":>10} {"Type":>15}')
f_vals = [(x, f(x)) for x in candidates]
min_x = min(f_vals, key=lambda p: p[1])
max_x = max(f_vals, key=lambda p: p[1])
for pt, val in f_vals:
    kind = 'Global MIN' if pt == min_x[0] else ('Global MAX' if pt == max_x[0] else 'Candidate')
    print(f'{pt:8.3f} {val:10.4f} {kind:>15}')

# --- MVT illustration ---
print('\n=== Mean Value Theorem ===')
x_mvt = np.linspace(a, b, 1000)
avg_slope = (f(b) - f(a)) / (b - a)
print(f'Average slope [f({b})-f({a})] / ({b}-{a}) = {avg_slope:.4f}')

# Find c where f'(c) = avg_slope: 3c^2 - 3 = avg_slope => c = sqrt((avg_slope+3)/3)
c_mvt = np.sqrt((avg_slope + 3) / 3)
print(f'MVT guarantees c in ({a},{b}) where f\'(c) = {avg_slope:.4f}')
print(f'Found c = {c_mvt:.4f}, f\'({c_mvt:.4f}) = {fp(c_mvt):.4f}')
print(f'PASS — MVT c in interval: {a < c_mvt < b}')

8. Activation Function Derivatives

Activation functions and their derivatives control gradient flow in neural networks. The vanishing gradient problem arises from small $|f'|$ values compounding through layers.
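For a scalar chain of sigmoid units $a_\ell = \sigma(w_\ell a_{\ell-1} + b_\ell)$, the chain rule gives

$$\frac{\partial a_L}{\partial a_0} = \prod_{\ell=1}^{L} w_\ell\, \sigma'(z_\ell), \qquad \sigma'(z) \le \tfrac14,$$

so the magnitude is bounded by $(1/4)^L \prod_\ell |w_\ell|$ and collapses exponentially unless the weights compensate.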

Code cell 20

# === 8.1-8.5 Activation Derivatives ===

try:
    import matplotlib.pyplot as plt
    HAS_MPL = True
except:
    HAS_MPL = False

x = np.linspace(-5, 5, 500)

# Sigmoid
def sigmoid(x): return 1 / (1 + np.exp(-np.clip(x, -500, 500)))
def sigmoid_prime(x): s = sigmoid(x); return s * (1 - s)

# Tanh
def tanh_prime(x): return 1 - np.tanh(x)**2

# ReLU
def relu(x): return np.maximum(0, x)
def relu_prime(x): return np.where(np.asarray(x) > 0, 1.0, 0.0)

# GELU
from scipy.special import erf
def gelu(x): return x * 0.5 * (1 + erf(x / np.sqrt(2)))
def gelu_prime(x):
    Phi = 0.5 * (1 + erf(x / np.sqrt(2)))
    phi = np.exp(-x**2/2) / np.sqrt(2*np.pi)
    return Phi + x * phi

# Numerical verification of derivatives at x=1.5
x0 = 1.5
h = 1e-6
def num_deriv(f, x): return (f(x+h) - f(x-h)) / (2*h)

print('=== Activation Derivative Verification at x=1.5 ===')
tests = [
    ('Sigmoid', sigmoid, sigmoid_prime),
    ('Tanh', np.tanh, tanh_prime),
    ('ReLU', relu, relu_prime),
    ('GELU', gelu, gelu_prime),
]
for name, fn, fn_prime in tests:
    a = fn_prime(x0)
    n = num_deriv(fn, x0)
    ok = abs(a - n) < 1e-5
    print(f"  {'PASS' if ok else 'FAIL'} {name}: analytic={a:.6f}, numerical={n:.6f}")

# Vanishing gradient analysis
print('\n=== Vanishing Gradient — Sigmoid Product Through L Layers ===')
z_typical = 2.0  # typical saturated input
sig_prime_val = sigmoid_prime(z_typical)
print(f'sigma\'({z_typical}) = {sig_prime_val:.6f}')
for L in [5, 10, 20, 50]:
    product = sig_prime_val**L
    print(f'  {L} layers: (sigma\')^{L} = {product:.2e}')

Code cell 21

# === 8.5 Activation Derivatives — Visualization ===

try:
    import matplotlib.pyplot as plt
    from scipy.special import erf
    HAS_MPL = True
except:
    HAS_MPL = False

import numpy as np

def sigmoid(x): return 1 / (1 + np.exp(-np.clip(x, -500, 500)))
def sigmoid_prime(x): s = sigmoid(x); return s * (1 - s)
def relu_prime(x): return np.where(np.asarray(x) > 0, 1.0, 0.0)
def gelu(x): return x * 0.5 * (1 + erf(x / np.sqrt(2)))
def gelu_prime(x):
    Phi = 0.5 * (1 + erf(x / np.sqrt(2)))
    phi = np.exp(-x**2/2) / np.sqrt(2*np.pi)
    return Phi + x * phi

x = np.linspace(-4, 4, 400)

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Left: activation functions
    ax = axes[0]
    ax.plot(x, sigmoid(x), label='Sigmoid', lw=2)
    ax.plot(x, np.tanh(x), label='Tanh', lw=2)
    ax.plot(x, np.maximum(0, x), label='ReLU', lw=2)
    ax.plot(x, gelu(x), label='GELU', lw=2, ls='--')
    ax.set_xlabel('x'); ax.set_ylabel('f(x)')
    ax.set_title('Activation Functions')
    ax.legend(); ax.set_ylim(-1.5, 4)

    # Right: derivatives
    ax2 = axes[1]
    ax2.plot(x, sigmoid_prime(x), label="Sigmoid'", lw=2)
    ax2.plot(x, 1 - np.tanh(x)**2, label="Tanh'", lw=2)
    ax2.plot(x, relu_prime(x), label="ReLU'", lw=2)
    ax2.plot(x, gelu_prime(x), label="GELU'", lw=2, ls='--')
    ax2.axhline(0.25, color='gray', ls=':', lw=1, label='1/4 (sigmoid max)')
    ax2.set_xlabel('x'); ax2.set_ylabel("f'(x)")
    ax2.set_title('Activation Derivatives')
    ax2.legend(); ax2.set_ylim(-0.2, 1.2)

    plt.tight_layout(); plt.show()
    print('Activation derivatives plotted.')
print('Peak gradient — Sigmoid:', f"{max(sigmoid_prime(x)):.4f} at x=0")
print('Peak gradient — Tanh:   ', f"{max(1-np.tanh(x)**2):.4f} at x=0")
print('Constant grad — ReLU:  1.0 for x>0 (no saturation)')

9. Numerical Differentiation

Finite differences approximate derivatives numerically. Two error sources compete: truncation error (decreases with $h$) and round-off error (increases as $h\to 0$). The centered difference, $O(h^2)$, is strongly preferred over the forward difference, $O(h)$.
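The optimum the cell below finds comes from balancing these two terms; for the centered difference, keeping only the leading contributions, the error behaves roughly like

$$\mathrm{err}(h) \approx \frac{h^2}{6}\,|f'''| + \frac{\varepsilon}{h}\,|f|, \qquad h_{\mathrm{opt}} \sim \varepsilon^{1/3} \approx 6\times 10^{-6}\ \text{for float64}.$$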

Code cell 23

# === 9.1-9.2 Finite Differences — Error Analysis ===

import numpy as np

try:
    import matplotlib.pyplot as plt
    HAS_MPL = True
except:
    HAS_MPL = False

def f(x): return np.sin(x)
def f_prime(x): return np.cos(x)

x0 = 1.0
true_deriv = f_prime(x0)

h_vals = np.logspace(-16, 0, 160)

forward_errors = []
centered_errors = []

for h in h_vals:
    fwd = (f(x0+h) - f(x0)) / h
    ctr = (f(x0+h) - f(x0-h)) / (2*h)
    forward_errors.append(abs(fwd - true_deriv))
    centered_errors.append(abs(ctr - true_deriv))

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.loglog(h_vals, forward_errors, 'b-', lw=2, label='Forward diff O(h)')
    ax.loglog(h_vals, centered_errors, 'r-', lw=2, label='Centered diff O(h²)')
    # Reference slopes
    h_ref = np.array([1e-8, 1e-4])
    ax.loglog(h_ref, h_ref, 'b--', alpha=0.5, label='O(h) slope')
    ax.loglog(h_ref, h_ref**2 * 1e4, 'r--', alpha=0.5, label='O(h²) slope')
    ax.set_xlabel('Step size h')
    ax.set_ylabel('|Error|')
    ax.set_title('Finite Difference Error vs Step Size (sin(x) at x=1)')
    ax.legend()
    plt.tight_layout(); plt.show()

# Find optimal h
opt_fwd_idx = np.argmin(forward_errors)
opt_ctr_idx = np.argmin(centered_errors)
print(f'Optimal h for forward diff: {h_vals[opt_fwd_idx]:.2e} (error: {forward_errors[opt_fwd_idx]:.2e})')
print(f'Optimal h for centered diff: {h_vals[opt_ctr_idx]:.2e} (error: {centered_errors[opt_ctr_idx]:.2e})')
print(f'Theoretical optimal centered: ~6e-6 (eps_mach^(1/3))')

Code cell 24

# === 9.3 Gradient Checking — Verifying Analytical Gradients ===

import numpy as np

def grad_check_scalar(f, x, analytic_grad, h=1e-5):
    """Check gradient of scalar-to-scalar function f."""
    numerical = (f(x+h) - f(x-h)) / (2*h)
    rel_error = abs(analytic_grad - numerical) / (abs(analytic_grad) + abs(numerical) + 1e-10)
    return rel_error, numerical

def grad_check_vector(f, x, analytic_grad, h=1e-5):
    """Check gradient of vector-to-scalar function f."""
    n = len(x)
    numerical_grad = np.zeros(n)
    for i in range(n):
        x_plus = x.copy(); x_plus[i] += h
        x_minus = x.copy(); x_minus[i] -= h
        numerical_grad[i] = (f(x_plus) - f(x_minus)) / (2*h)
    rel_error = np.linalg.norm(analytic_grad - numerical_grad) / \
                (np.linalg.norm(analytic_grad) + np.linalg.norm(numerical_grad) + 1e-10)
    return rel_error, numerical_grad

print('=== Gradient Checking Examples ===')

# Example 1: f(x) = x*ln(x+e^(-x))
def f1(x): return x * np.log(x + np.exp(-x))
def f1_prime(x): return np.log(x + np.exp(-x)) + x * (1 - np.exp(-x)) / (x + np.exp(-x))

x_test = 1.5
err1, num1 = grad_check_scalar(f1, x_test, f1_prime(x_test))
print(f'f(x) = x*ln(x+e^-x): analytic={f1_prime(x_test):.6f}, numerical={num1:.6f}')
print(f'  Relative error: {err1:.2e}{"PASS" if err1 < 1e-5 else "FAIL"}')

# Example 2: L2 loss gradient
def loss(w):
    x = np.array([1.0, 2.0, -1.0])
    y = 1.5
    return 0.5 * (np.dot(w, x) - y)**2

def loss_grad(w):
    x = np.array([1.0, 2.0, -1.0])
    y = 1.5
    return (np.dot(w, x) - y) * x

w = np.array([0.5, -0.3, 1.2])
err2, num_grad = grad_check_vector(loss, w, loss_grad(w))
print(f'\nL2 loss gradient:')
print(f'  Analytic: {loss_grad(w)}')
print(f'  Numerical: {num_grad}')
print(f'  Relative error: {err2:.2e}{"PASS" if err2 < 1e-5 else "FAIL"}')

10. Key Proofs — Interactive Verification

We verify the main derivative proofs computationally, checking that the theoretical formulas match numerical estimates to machine precision.

Code cell 26

# === 10. Proof Verification ===

import numpy as np

def centered(f, x, h=1e-7):
    return (f(x+h) - f(x-h)) / (2*h)

print('=== Verifying All Main Derivative Rules ===')

tests = [
    # (name, analytic_fn, numerical_fn, x)
    ('Power x^7', lambda x: 7*x**6, lambda x: centered(lambda t: t**7, x), 1.5),
    ('sin(x)', np.cos, lambda x: centered(np.sin, x), 1.2),
    ('cos(x)', lambda x: -np.sin(x), lambda x: centered(np.cos, x), 0.7),
    ('e^x', np.exp, lambda x: centered(np.exp, x), 2.0),
    ('ln(x)', lambda x: 1/x, lambda x: centered(np.log, x), 3.0),
    ('arctan(x)', lambda x: 1/(1+x**2), lambda x: centered(np.arctan, x), 1.0),
    ('arcsin(x)', lambda x: 1/np.sqrt(1-x**2), lambda x: centered(np.arcsin, x), 0.5),
    ('x^x (log diff)', lambda x: x**x*(np.log(x)+1), lambda x: centered(lambda t: t**t, x), 2.0),
    ('x^2*e^x (product)', lambda x: x**2*np.exp(x) + 2*x*np.exp(x), 
                          lambda x: centered(lambda t: t**2*np.exp(t), x), 1.5),
    ('sin(e^x) (chain)', lambda x: np.cos(np.exp(x))*np.exp(x),
                          lambda x: centered(lambda t: np.sin(np.exp(t)), x), 0.5),
    ('Softplus derivative=sigma', lambda x: 1/(1+np.exp(-x)),
                                   lambda x: centered(lambda t: np.log(1+np.exp(t)), x), 1.0),
]

all_pass = True
for name, analytic, numerical, x in tests:
    a = analytic(x); n = numerical(x)
    err = abs(a - n) / (abs(a) + 1e-10)
    ok = err < 1e-5
    all_pass = all_pass and ok
    print(f"  {'PASS' if ok else 'FAIL'} {name}: err={err:.2e}")

print(f'\nAll tests: {"PASS" if all_pass else "SOME FAILED"}')

11. Application — Adam Optimizer Analysis

The Adam optimizer maintains exponential moving averages of the gradient (first moment) and the squared gradient (second moment) to adapt its effective step size. We run Adam on a perturbed quadratic and compare its convergence against vanilla SGD.

Code cell 28

# === 11. Adam Optimizer — Single Variable Simulation ===

import numpy as np

try:
    import matplotlib.pyplot as plt
    HAS_MPL = True
except:
    HAS_MPL = False

# Loss: f(theta) = (theta - 3)^2 + 0.1*sin(5*theta)
def loss(theta): return (theta - 3)**2 + 0.1*np.sin(5*theta)
def grad(theta): return 2*(theta - 3) + 0.5*np.cos(5*theta)

def run_sgd(theta0, lr=0.1, steps=200):
    theta = theta0; trajectory = [theta0]
    for _ in range(steps):
        theta -= lr * grad(theta)
        trajectory.append(theta)
    return np.array(trajectory)

def run_adam(theta0, lr=0.1, b1=0.9, b2=0.999, eps=1e-8, steps=200):
    theta = theta0; m = 0.0; v = 0.0
    trajectory = [theta0]
    for t in range(1, steps+1):
        g = grad(theta)
        m = b1*m + (1-b1)*g
        v = b2*v + (1-b2)*g**2
        m_hat = m / (1 - b1**t)
        v_hat = v / (1 - b2**t)
        theta -= lr * m_hat / (np.sqrt(v_hat) + eps)
        trajectory.append(theta)
    return np.array(trajectory)

theta0 = -2.0
sgd_traj = run_sgd(theta0, lr=0.1)
adam_traj = run_adam(theta0, lr=0.5)

print('=== Optimizer Comparison ===')
print(f'True minimum near theta=3 (ignoring sin perturbation)')
print(f'SGD  final theta: {sgd_traj[-1]:.6f}, loss: {loss(sgd_traj[-1]):.6f}')
print(f'Adam final theta: {adam_traj[-1]:.6f}, loss: {loss(adam_traj[-1]):.6f}')

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    steps = np.arange(len(sgd_traj))
    axes[0].plot(steps, loss(sgd_traj), 'b-', lw=2, label='SGD')
    axes[0].plot(steps, loss(adam_traj), 'r-', lw=2, label='Adam')
    axes[0].set_xlabel('Step'); axes[0].set_ylabel('Loss')
    axes[0].set_title('Loss vs Training Step')
    axes[0].legend(); axes[0].set_yscale('log')

    theta_plot = np.linspace(-3, 5, 300)
    axes[1].plot(theta_plot, loss(theta_plot), 'k-', lw=2, label='Loss surface')
    axes[1].plot(sgd_traj[:50], loss(sgd_traj[:50]), 'b.-', ms=4, alpha=0.7, label='SGD path')
    axes[1].plot(adam_traj[:50], loss(adam_traj[:50]), 'r.-', ms=4, alpha=0.7, label='Adam path')
    axes[1].set_xlabel('theta'); axes[1].set_ylabel('Loss')
    axes[1].set_title('Optimizer Trajectories on Loss Surface (first 50 steps)')
    axes[1].legend()
    plt.tight_layout(); plt.show()

12. Backpropagation — Chain Rule on Computation Graphs

We implement a complete 2-layer network forward/backward pass, verify gradients against numerical estimates, and visualize gradient flow through layers.

Code cell 30

# === 12. Full 2-Layer Network Backprop ===

import numpy as np

def sigmoid(x): return 1/(1+np.exp(-x))
def sigmoid_prime(x): s = sigmoid(x); return s*(1-s)

class TwoLayerNet:
    def __init__(self, w1, b1, w2, b2):
        self.w1 = w1; self.b1 = b1
        self.w2 = w2; self.b2 = b2

    def forward(self, x, y):
        self.x = x; self.y = y
        self.z1 = self.w1*x + self.b1
        self.a1 = sigmoid(self.z1)
        self.z2 = self.w2*self.a1 + self.b2
        self.a2 = sigmoid(self.z2)
        self.loss = 0.5*(self.a2 - y)**2
        return self.loss

    def backward(self):
        # Chain rule, layer by layer (same computation as backprop() below)
        dL_dz2 = (self.a2 - self.y) * sigmoid_prime(self.z2)
        dL_dz1 = dL_dz2 * self.w2 * sigmoid_prime(self.z1)
        return dict(w1=dL_dz1*self.x, b1=dL_dz1,
                    w2=dL_dz2*self.a1, b2=dL_dz2)

# Manual backprop
def backprop(x, y, w1, b1, w2, b2):
    # Forward
    z1 = w1*x + b1
    a1 = sigmoid(z1)
    z2 = w2*a1 + b2
    a2 = sigmoid(z2)
    L = 0.5*(a2 - y)**2

    # Backward
    dL_da2 = a2 - y
    da2_dz2 = sigmoid_prime(z2)
    dL_dz2 = dL_da2 * da2_dz2

    dL_dw2 = dL_dz2 * a1
    dL_db2 = dL_dz2

    dL_da1 = dL_dz2 * w2
    da1_dz1 = sigmoid_prime(z1)
    dL_dz1 = dL_da1 * da1_dz1

    dL_dw1 = dL_dz1 * x
    dL_db1 = dL_dz1

    return L, dict(w1=dL_dw1, b1=dL_db1, w2=dL_dw2, b2=dL_db2)

# Test values
x, y, w1, b1, w2, b2 = 1.5, 0.7, 2.0, -0.5, 1.2, 0.3
L, grads = backprop(x, y, w1, b1, w2, b2)

print('=== 2-Layer Network Gradient Check ===')
print(f'Loss: {L:.6f}')

# Numerical check via parameter perturbation
h = 1e-5
for param_name, param_val, idx in [('w1',w1,0),('b1',b1,1),('w2',w2,2),('b2',b2,3)]:
    params = [w1, b1, w2, b2]
    p_plus = params[:]; p_plus[idx] += h
    p_minus = params[:]; p_minus[idx] -= h
    L_p = backprop(x, y, *p_plus)[0]
    L_m = backprop(x, y, *p_minus)[0]
    numerical = (L_p - L_m) / (2*h)
    analytic = grads[param_name]
    ok = abs(analytic - numerical) < 1e-6
    print(f"  {'PASS' if ok else 'FAIL'} d{param_name}: analytic={analytic:.8f}, "
          f"numerical={numerical:.8f}")

13. Softmax and Cross-Entropy Gradient

The gradient of cross-entropy loss with respect to the softmax logits has a beautifully simple form: $\partial\mathcal{L}/\partial z_j = p_j - y_j$.
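The derivation is short. With $p_i = e^{z_i}/\sum_k e^{z_k}$, one-hot $y$, and $\mathcal{L} = -\sum_i y_i \log p_i$, using $\partial p_i/\partial z_j = p_i(\delta_{ij} - p_j)$:

$$\frac{\partial\mathcal{L}}{\partial z_j} = -\sum_i \frac{y_i}{p_i}\,p_i(\delta_{ij} - p_j) = -y_j + p_j\sum_i y_i = p_j - y_j.$$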

Code cell 32

# === 13. Softmax + Cross-Entropy Gradient ===

import numpy as np

def softmax(z):
    z_stable = z - np.max(z)  # numerical stability
    e = np.exp(z_stable)
    return e / e.sum()

def cross_entropy(p, y):
    return -np.sum(y * np.log(p + 1e-12))

def softmax_ce_grad(z, y):
    """Gradient of CE loss w.r.t. logits z — should equal p - y."""
    p = softmax(z)
    return p - y

# Test
K = 5
np.random.seed(42)
z = np.random.randn(K)
y = np.zeros(K); y[2] = 1.0  # true class = 2

p = softmax(z)
L = cross_entropy(p, y)
analytic_grad = softmax_ce_grad(z, y)

# Numerical gradient
h = 1e-5
numerical_grad = np.zeros(K)
for i in range(K):
    z_p = z.copy(); z_p[i] += h
    z_m = z.copy(); z_m[i] -= h
    numerical_grad[i] = (cross_entropy(softmax(z_p), y) - cross_entropy(softmax(z_m), y)) / (2*h)

print('=== Softmax + Cross-Entropy Gradient ===')
print(f'Logits z: {z}')
print(f'Probs  p: {p}')
print(f'Loss   L: {L:.6f}')
print(f'\nGradient comparison (should be p - y):')
print(f'  p - y:     {p - y}')
print(f'  Analytic:  {analytic_grad}')
print(f'  Numerical: {numerical_grad}')
err = np.linalg.norm(analytic_grad - numerical_grad)
print(f'\nPASS - gradient error: {err:.2e}' if err < 1e-5 else f'FAIL - error: {err:.2e}')

14. Vanishing Gradients — Quantitative Analysis

We simulate gradient propagation through $L$ sigmoid layers and observe exponential decay. This motivates ReLU and residual connections in deep networks.

Code cell 34

# === 14. Vanishing Gradient Simulation ===

import numpy as np

try:
    import matplotlib.pyplot as plt
    HAS_MPL = True
except:
    HAS_MPL = False

def sigmoid(x): return 1/(1+np.exp(-x))
def sigmoid_prime(x): s = sigmoid(x); return s*(1-s)
def relu_prime(x): return float(x > 0)

np.random.seed(42)
max_layers = 50

# Simulate gradient magnitude at layer 1 as function of depth L
sigmoid_grads = []
relu_grads = []

for L in range(1, max_layers+1):
    # Random pre-activations (sigmoid tends to saturate)
    z_sigmoid = np.random.randn(L) * 2  # some saturation
    z_relu = np.abs(np.random.randn(L))  # mostly positive

    grad_sig = np.prod([sigmoid_prime(z) for z in z_sigmoid])
    grad_relu = np.prod([relu_prime(z) for z in z_relu])

    sigmoid_grads.append(abs(grad_sig))
    relu_grads.append(abs(grad_relu))

layers = np.arange(1, max_layers+1)

print('=== Gradient Magnitude at Layer 1 vs Network Depth ===')
for L in [1, 5, 10, 20, 50]:
    print(f'  L={L:2d}: sigmoid={sigmoid_grads[L-1]:.2e}, relu={relu_grads[L-1]:.2e}')

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.semilogy(layers, sigmoid_grads, 'b-', lw=2, label='Sigmoid activations')
    ax.semilogy(layers, relu_grads, 'r-', lw=2, label='ReLU activations')
    ax.axhline(1e-7, color='gray', ls=':', label='Float32 precision floor')
    ax.set_xlabel('Number of layers L')
    ax.set_ylabel('|Gradient at layer 1|')
    ax.set_title('Vanishing Gradient: Sigmoid vs ReLU')
    ax.legend()
    plt.tight_layout(); plt.show()
    print('\nConclusion: sigmoid gradients vanish exponentially;')
    print('ReLU maintains O(1) gradient for active units.')

15. Numerically Stable Derivative Computations

Floating-point arithmetic introduces subtle errors. We compare naive vs. stable implementations for sigmoid, log-sigmoid, and softmax.

Code cell 36

# === 15. Numerical Stability ===

import numpy as np

# Naive sigmoid — overflows for large negative x
def sigmoid_naive(x): return 1 / (1 + np.exp(-x))

# Stable sigmoid
def sigmoid_stable(x):
    return np.where(x >= 0,
                    1 / (1 + np.exp(-x)),
                    np.exp(x) / (1 + np.exp(x)))

# Naive log-sigmoid
def log_sigmoid_naive(x): return np.log(sigmoid_naive(x))

# Stable log-sigmoid
def log_sigmoid_stable(x): return -np.logaddexp(0, -x)

test_vals = [-1000, -100, -10, 0, 10, 100, 1000]

print('=== Sigmoid Stability ===')
print(f'{"x":>8} {"Naive":>15} {"Stable":>15} {"True":>15}')
for x in test_vals:
    naive = sigmoid_naive(np.float64(x))
    stable = sigmoid_stable(np.float64(x))
    true_val = 1/(1+np.exp(-np.float64(x))) if abs(x) < 700 else (1.0 if x > 0 else 0.0)
    print(f'{x:8.0f} {naive:15.8f} {stable:15.8f} {true_val:15.8f}')

print('\n=== Log-Sigmoid Stability ===')
print(f'{"x":>8} {"Naive":>15} {"Stable":>15}')
for x in [-100, -10, 0, 10, 100]:
    with np.errstate(divide='ignore', invalid='ignore'):
        naive = log_sigmoid_naive(np.float64(x))
    stable = log_sigmoid_stable(np.float64(x))
    print(f'{x:8.0f} {naive:15.8f} {stable:15.8f}')
print('\nNaive gives -inf or 0 for large |x|; stable is correct throughout.')
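
The same shift-by-the-max idea extends to softmax and log-softmax (mentioned in the section intro but not exercised in the cell above); a minimal sketch, with the test logits chosen here purely for illustration:

import numpy as np

def softmax_naive(z):
    e = np.exp(z)                    # overflows to inf once any z_i exceeds ~709
    return e / e.sum()

def softmax_stable(z):
    z = z - np.max(z)                # softmax is shift-invariant, so subtract the max logit
    e = np.exp(z)
    return e / e.sum()

def log_softmax_stable(z):
    z = z - np.max(z)
    return z - np.log(np.sum(np.exp(z)))   # log-sum-exp on the shifted logits

z = np.array([1000.0, 1001.0, 1002.0])
with np.errstate(over='ignore', invalid='ignore'):
    print('naive  :', softmax_naive(z))     # nan from inf/inf
print('stable :', softmax_stable(z))        # well-defined probabilities
print('log-softmax:', log_softmax_stable(z))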

16. Logarithmic Differentiation

When a function is a product or quotient of many terms, taking $\ln$ first and differentiating implicitly is often much simpler.

Code cell 38

# === 16. Logarithmic Differentiation ===

import numpy as np

def centered(f, x, h=1e-6): return (f(x+h) - f(x-h)) / (2*h)

# f(x) = x^x
# ln f = x ln x => f'/f = ln x + 1 => f' = x^x * (ln x + 1)
def f_xx(x): return x**x
def f_xx_prime(x): return x**x * (np.log(x) + 1)

x = 2.0
print(f'x^x derivative at x=2:')
print(f'  Analytic: {f_xx_prime(x):.8f}')
print(f'  Numerical: {centered(f_xx, x):.8f}')
print(f'  PASS: {abs(f_xx_prime(x) - centered(f_xx, x)) < 1e-5}')

# f(x) = (sin x)^x for x > 0
# ln f = x ln(sin x) => f'/f = ln(sin x) + x*cos(x)/sin(x)
def f_sinx_x(x): return np.sin(x)**x
def f_sinx_x_prime(x):
    return np.sin(x)**x * (np.log(np.sin(x)) + x*np.cos(x)/np.sin(x))

x = 1.5
print(f'\n(sin x)^x derivative at x=1.5:')
print(f'  Analytic: {f_sinx_x_prime(x):.8f}')
print(f'  Numerical: {centered(f_sinx_x, x):.8f}')
print(f'  PASS: {abs(f_sinx_x_prime(x) - centered(f_sinx_x, x)) < 1e-5}')

# Demonstrate for multi-factor expression
# g(x) = (x^3 * sqrt(x+1)) / (x^2+2)^4
def g(x): return (x**3 * np.sqrt(x+1)) / (x**2+2)**4
def g_prime(x):
    # ln g = 3 ln x + 0.5 ln(x+1) - 4 ln(x^2+2)
    # g'/g = 3/x + 1/(2(x+1)) - 8x/(x^2+2)
    return g(x) * (3/x + 1/(2*(x+1)) - 8*x/(x**2+2))

x = 1.5
print(f'\nMulti-factor g(x) at x=1.5:')
print(f'  Analytic: {g_prime(x):.8f}')
print(f'  Numerical: {centered(g, x):.8f}')
print(f'  PASS: {abs(g_prime(x) - centered(g, x)) < 1e-5}')

17. Inverse Function Theorem

If $f'(a)\neq 0$, then $f^{-1}$ is differentiable at $b=f(a)$ with $(f^{-1})'(b) = 1/f'(a)$. We use this to derive the inverse trig derivatives.
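Applied to $f(x)=\sin x$ on $(-\pi/2, \pi/2)$ with $b = \sin a$:

$$(\arcsin)'(b) = \frac{1}{\cos a} = \frac{1}{\sqrt{1-\sin^2 a}} = \frac{1}{\sqrt{1-b^2}},$$

and the same pattern with $f = \tan$ gives $(\arctan)'(b) = 1/(1+b^2)$.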

Code cell 40

# === 17. Inverse Function Theorem ===

import numpy as np

def centered(f, x, h=1e-6): return (f(x+h) - f(x-h)) / (2*h)

# arcsin: (arcsin x)' = 1/sqrt(1-x^2)
def arcsin_prime(x): return 1/np.sqrt(1-x**2)

# arctan: (arctan x)' = 1/(1+x^2)
def arctan_prime(x): return 1/(1+x**2)

test_pts = [0.0, 0.3, -0.5]
print('=== Inverse Trig Derivatives ===')
for x in test_pts:
    a_asin = arcsin_prime(x); n_asin = centered(np.arcsin, x)
    a_atan = arctan_prime(x); n_atan = centered(np.arctan, x)
    print(f'x={x:+.1f}: arcsin\' analytic={a_asin:.6f}, numerical={n_asin:.6f} '
          f'| arctan\' analytic={a_atan:.6f}, numerical={n_atan:.6f}')

print('\nAll PASS:', all(
    abs(arcsin_prime(x) - centered(np.arcsin, x)) < 1e-5 and
    abs(arctan_prime(x) - centered(np.arctan, x)) < 1e-5
    for x in test_pts
))

# IFT: verify (f^-1)'(f(x)) = 1/f'(x) for f = exp
print('\nIFT check: (ln)\' at y = e^x = f(x), should equal 1/f\'(x) = 1/e^x')
for x in [0.5, 1.0, 2.0]:
    y = np.exp(x)
    inv_prime_at_y = 1/y  # (ln)' at y
    ift_formula = 1/np.exp(x)  # 1/f'(x)
    print(f'  x={x}: (ln)\'(e^x)={inv_prime_at_y:.6f}, 1/e^x={ift_formula:.6f}')

18. Mean Value Theorem — Visualization

We visualize the MVT: for $f$ on $[a,b]$, there exists a $c$ where the tangent is parallel to the secant. Applications: monotonicity, Lipschitz bounds.

Code cell 42

# === 18. MVT Visualization ===

import numpy as np

try:
    import matplotlib.pyplot as plt
    HAS_MPL = True
except:
    HAS_MPL = False

def f(x): return np.sin(x) + 0.3*x
def fp(x): return np.cos(x) + 0.3

a, b = 0.5, 4.0
secant_slope = (f(b) - f(a)) / (b - a)

# Find c where f'(c) = secant_slope numerically
x_scan = np.linspace(a, b, 10000)
fp_vals = fp(x_scan)
# c where fp closest to secant_slope
c_idx = np.argmin(abs(fp_vals - secant_slope))
c_mvt = x_scan[c_idx]

print(f'f(x) = sin(x) + 0.3x on [{a}, {b}]')
print(f'Secant slope: {secant_slope:.4f}')
print(f'MVT point c = {c_mvt:.4f}, f\'(c) = {fp(c_mvt):.4f}')

if HAS_MPL:
    x_plot = np.linspace(0, 4.5, 300)
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(x_plot, f(x_plot), 'b-', lw=2, label='f(x)')

    # Secant line
    x_sec = np.linspace(a-0.2, b+0.2, 50)
    y_sec = f(a) + secant_slope*(x_sec - a)
    ax.plot(x_sec, y_sec, 'r-', lw=2, label=f'Secant (slope={secant_slope:.2f})')

    # Tangent at c
    x_tang = np.linspace(c_mvt-0.8, c_mvt+0.8, 50)
    y_tang = f(c_mvt) + fp(c_mvt)*(x_tang - c_mvt)
    ax.plot(x_tang, y_tang, 'g--', lw=2, label=f'Tangent at c={c_mvt:.2f}')

    ax.plot([a, b], [f(a), f(b)], 'ro', ms=8)
    ax.plot(c_mvt, f(c_mvt), 'g^', ms=10, label='MVT point c')
    ax.set_xlabel('x'); ax.set_ylabel('f(x)')
    ax.set_title('Mean Value Theorem')
    ax.legend()
    plt.tight_layout(); plt.show()

19. Derivatives and Taylor Approximation

The first-order Taylor approximation $f(x) \approx f(a) + f'(a)(x-a)$ is the derivative's most direct practical application. We verify the approximation quality across multiple functions.

Code cell 44

# === 19. Linear (First-Order Taylor) Approximation ===

import numpy as np

try:
    import matplotlib.pyplot as plt
    HAS_MPL = True
except:
    HAS_MPL = False

def linear_approx(f, fp, a, x):
    return f(a) + fp(a)*(x - a)

functions = [
    ('sin(x)', np.sin, np.cos, np.pi/4),
    ('e^x', np.exp, np.exp, 0.0),
    ('ln(x)', np.log, lambda x: 1/x, 1.0),
    ('sqrt(x)', np.sqrt, lambda x: 1/(2*np.sqrt(x)), 4.0),
]

print('=== Linear Approximation Quality ===')
print(f'{"Function":>12} {"a":>6} {"x-a":>6} {"True f(x)":>12} {"Linear":>12} {"Error":>10}')

for name, f, fp, a in functions:
    for dx in [0.1, 0.5, 1.0]:
        x = a + dx
        true_val = f(x)
        lin_val = linear_approx(f, fp, a, x)
        error = abs(true_val - lin_val)
        print(f'{name:>12} {a:6.2f} {dx:6.2f} {true_val:12.6f} {lin_val:12.6f} {error:10.2e}')

print('\nPattern: error grows as O((x-a)^2) — the quadratic (second-order) term.')
print('The full story is in §04-Series-and-Sequences (Taylor series).')

20. Subgradients and Non-Smooth Functions

For non-differentiable convex functions (ReLU, |x|, L1 norm), subgradients generalize the derivative. Subgradient descent is used for L1 regularization.

Code cell 46

# === 20. Subgradients ===

import numpy as np

try:
    import matplotlib.pyplot as plt
    HAS_MPL = True
except:
    HAS_MPL = False

# Subgradient of |x| at x=0 is any value in [-1,1]
# For optimization: use sign(x), subgradient 0 at x=0 is standard choice

x = np.linspace(-2, 2, 400)
f_abs = np.abs(x)
f_relu = np.maximum(0, x)

# Supporting hyperplanes at x=0 for |x|
x_supp = np.linspace(-1.5, 1.5, 50)
subgrads_to_show = [-1.0, -0.5, 0.0, 0.5, 1.0]

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(13, 5))

    ax = axes[0]
    ax.plot(x, f_abs, 'b-', lw=3, label='|x|', zorder=3)
    colors = plt.cm.plasma(np.linspace(0.1, 0.9, len(subgrads_to_show)))
    for g, c in zip(subgrads_to_show, colors):
        y_supp = g * x_supp  # supporting line: f(0) + g*(x-0) = g*x
        ax.plot(x_supp, y_supp, '--', color=c, alpha=0.7, lw=1.5,
                label=f'g={g:+.1f}')
    ax.set_xlim(-1.5, 1.5); ax.set_ylim(-0.2, 1.5)
    ax.set_xlabel('x'); ax.set_title('Subgradients of |x| at x=0')
    ax.legend(fontsize=9)

    ax2 = axes[1]
    ax2.plot(x, f_relu, 'r-', lw=3, label='ReLU(x)')
    for g, c in zip([0.0, 0.5, 1.0], ['blue','green','orange']):
        y_supp = g * x_supp
        ax2.plot(x_supp, y_supp, '--', color=c, alpha=0.7, label=f'g={g:.1f}')
    ax2.set_xlim(-1.5, 1.5); ax2.set_ylim(-0.3, 1.5)
    ax2.set_xlabel('x'); ax2.set_title('Subgradients of ReLU at x=0')
    ax2.legend()
    plt.tight_layout(); plt.show()

# Soft-thresholding (proximal operator of lambda*|x|)
lam = 0.3
def soft_threshold(v, lam):
    return np.sign(v) * np.maximum(np.abs(v) - lam, 0)

v_test = np.array([-1.0, -0.2, 0.0, 0.15, 0.5, 1.0])
print('=== Soft-Thresholding (Proximal Operator of L1) ===')
print(f'lambda = {lam}')
for v in v_test:
    print(f'  v={v:+.2f} -> ST={soft_threshold(v, lam):+.2f}')
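
Tying the two ideas together: subgradient descent on a 1-D lasso-style objective converges to the same soft-thresholded value computed above. A minimal sketch (the objective and step-size schedule are illustrative choices, not from the original notebook):

import numpy as np

lam = 0.3

def subgrad(w):
    # gradient of the smooth part plus one valid subgradient of lam*|w| (0 chosen at w = 0)
    return (w - 2.0) + lam * np.sign(w)

w = -1.0
for t in range(1, 201):
    w -= (0.5 / np.sqrt(t)) * subgrad(w)   # diminishing steps, standard for subgradient methods

print(f'w after 200 steps:        {w:.4f}')
print(f'soft_threshold(2.0, lam): {np.sign(2.0) * max(abs(2.0) - lam, 0.0):.4f}')  # closed-form minimizer 1.7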

21. Modern Activations — SiLU, Swish, Mish

Post-ReLU activations used in modern transformers. SiLU/Swish is $x \cdot \sigma(x)$; Mish is $x \cdot \tanh(\mathrm{softplus}(x))$.

Code cell 48

# === 21. Modern Activations ===

import numpy as np

try:
    import matplotlib.pyplot as plt
    from scipy.special import erf
    HAS_MPL = True
except:
    HAS_MPL = False

def sigmoid(x): return 1/(1+np.exp(-np.clip(x,-500,500)))

# SiLU / Swish
def swish(x): return x * sigmoid(x)
def swish_prime(x):
    s = sigmoid(x)
    return s + x * s * (1 - s)

# Mish
def mish(x): return x * np.tanh(np.log(1+np.exp(np.clip(x,-500,500))))
def mish_prime(x):
    sp = np.log(1+np.exp(np.clip(x,-500,500)))  # softplus(x)
    ts = np.tanh(sp)
    sig = sigmoid(x)
    return ts + x * (1 - ts**2) * sig

# GELU for comparison
def gelu(x): return x * 0.5 * (1 + erf(x/np.sqrt(2)))
def gelu_prime(x):
    Phi = 0.5 * (1 + erf(x/np.sqrt(2)))
    phi = np.exp(-x**2/2) / np.sqrt(2*np.pi)
    return Phi + x*phi

# Verify derivatives
def centered(f, x, h=1e-6): return (f(x+h) - f(x-h)) / (2*h)

x = 1.5
print('=== Modern Activation Derivative Verification at x=1.5 ===')
for name, fn, fn_prime in [('Swish', swish, swish_prime), ('Mish', mish, mish_prime),
                             ('GELU', gelu, gelu_prime)]:
    a = fn_prime(x); n = centered(fn, x)
    print(f"  {'PASS' if abs(a-n)<1e-5 else 'FAIL'} {name}: analytic={a:.6f}, numerical={n:.6f}")

x_plot = np.linspace(-3, 3, 400)
if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    for fn, nm, c in [(swish,'Swish','b'), (mish,'Mish','r'), (gelu,'GELU','g'),
                       (lambda x: np.maximum(0,x), 'ReLU', 'k')]:
        axes[0].plot(x_plot, fn(x_plot), lw=2, color=c, label=nm)
    axes[0].set_ylim(-0.5, 3); axes[0].set_title('Modern Activations')
    axes[0].set_xlabel('x'); axes[0].legend()

    for fn, nm, c in [(swish_prime,'Swish\'','b'), (mish_prime,'Mish\'','r'),
                       (gelu_prime,'GELU\'','g')]:
        axes[1].plot(x_plot, fn(x_plot), lw=2, color=c, label=nm)
    axes[1].set_ylim(-0.2, 1.2); axes[1].set_title('Activation Derivatives')
    axes[1].set_xlabel('x'); axes[1].legend()
    plt.tight_layout(); plt.show()

22. Gradient Descent — Convergence Analysis

For $L$-smooth functions, a learning rate $\eta < 2/L$ guarantees descent. We verify the descent lemma numerically.
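The descent lemma for an $L$-smooth $f$ states $f(\theta - \eta f'(\theta)) \le f(\theta) - \eta\left(1 - \tfrac{L\eta}{2}\right)f'(\theta)^2$, so the decrement is positive whenever $\eta < 2/L$. A small spot-check on the same quadratic used in the cell below (this check is an addition, not part of the original cell):

import numpy as np

L_smooth = 5.0
f    = lambda th: (L_smooth / 2) * th**2
grad = lambda th: L_smooth * th

np.random.seed(0)
holds = True
for eta in [0.05, 0.1, 0.3, 0.39]:                 # all below 2/L = 0.4
    for th in np.random.uniform(-5, 5, 100):
        lhs = f(th - eta * grad(th))
        rhs = f(th) - eta * (1 - L_smooth * eta / 2) * grad(th)**2
        holds &= (lhs <= rhs + 1e-9)               # the bound is an equality for a quadratic
print('Descent lemma satisfied at all sampled points:', bool(holds))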

Code cell 50

# === 22. Gradient Descent Convergence ===

import numpy as np

try:
    import matplotlib.pyplot as plt
    HAS_MPL = True
except:
    HAS_MPL = False

# Quadratic: f(theta) = (L_smooth/2) * theta^2, which is L-smooth with constant L = L_smooth
L_smooth = 5.0
def f_quad(theta): return (L_smooth/2) * theta**2
def grad_quad(theta): return L_smooth * theta

theta0 = 3.0
steps = 30

print(f'=== Gradient Descent on f(theta) = ({L_smooth}/2)*theta^2 ===')
print(f'L-smooth constant L = {L_smooth}')
print(f'Critical eta: 2/L = {2/L_smooth:.4f}')

learning_rates = [0.1, 0.3, 1/L_smooth, 0.35, 0.4]

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(10, 6))

for eta in learning_rates:
    theta = theta0
    losses = [f_quad(theta)]
    for _ in range(steps):
        theta -= eta * grad_quad(theta)
        losses.append(f_quad(theta))
    converges = losses[-1] < 1e-6
    print(f'  eta={eta:.2f}: final loss={losses[-1]:.2e} ({"converges" if converges else "DIVERGES"})')
    if HAS_MPL:
        ax.semilogy(losses, lw=2, label=f'eta={eta:.2f}')

if HAS_MPL:
    ax.axhline(1e-6, color='gray', ls=':', label='Convergence threshold')
    ax.set_xlabel('Step'); ax.set_ylabel('Loss (log scale)')
    ax.set_title(f'GD on Quadratic (L={L_smooth}) — Effect of Learning Rate')
    ax.legend()
    plt.tight_layout(); plt.show()

print(f'\nConclusion: eta > 2/L diverges, eta = 2/L = {2/L_smooth:.2f} oscillates without progress; eta < 2/L converges.')

23. Score Function and Fisher Information

The derivative of the log-likelihood is the score. Its variance is the Fisher information, which connects calculus to statistical estimation and second-order optimization.

Code cell 52

# === 23. Score Function and Fisher Information ===

import numpy as np

# Gaussian family: p(x; mu) = (1/sqrt(2pi)) * exp(-(x-mu)^2/2)
# log p = const - (x-mu)^2/2
# score = d/dmu log p = x - mu
# Fisher info: E[(score)^2] = E[(x-mu)^2] = Var[X] = 1 (for standard normal)

np.random.seed(42)
mu_true = 2.0
n_samples = 10000
X = np.random.normal(mu_true, 1.0, n_samples)

# Compute score and Fisher info empirically
def score(x, mu): return x - mu
def log_likelihood(X, mu): return -0.5*np.sum((X - mu)**2) - len(X)*0.5*np.log(2*np.pi)

# MLE: dL/dmu = 0 => sum(x_i - mu) = 0 => mu = xbar
mu_mle = np.mean(X)

# Score at MLE should be ~0 (optimality condition)
score_at_mle = np.mean(score(X, mu_mle))

# Fisher info: E[(x-mu)^2] should be Var[X] = 1
fisher_empirical = np.mean(score(X, mu_true)**2)

print('=== Score Function and Fisher Information ===')
print(f'True mu: {mu_true}, MLE mu_hat: {mu_mle:.4f}')
print(f'Score at MLE (should be ~0): {score_at_mle:.6f}')
print(f'Empirical Fisher info: {fisher_empirical:.4f} (true: 1.0)')

# Fisher-Rao connection: Fisher info = -E[d^2 log p/dmu^2]
# d^2/dmu^2 log p = -1 for Gaussian, so E[-(-1)] = 1
fisher_via_hessian = -np.mean([-1.0 for _ in X])  # = 1
print(f'Fisher via Hessian: {fisher_via_hessian:.4f} (Fisher identity: I = -E[f\'\'])')

# Cramer-Rao bound: Var[mu_hat] >= 1/I(mu)
print(f'\nCramer-Rao bound: Var[MLE] >= 1/I = {1/fisher_empirical:.4f}')
print(f'Actual Var[X_bar] = 1/n = {1/n_samples:.4f} (achieves bound — MLE is efficient)')

Summary

This notebook covered the full theory of differentiation:

Section | Key Result
§1 Intuition | Secant → tangent line as $h\to 0$
§2 Definition | $f'(a) = \lim_{h\to 0}[f(a+h)-f(a)]/h$
§3 Rules | Power, product, quotient, chain, elementary functions
§4 Chain Rule | $(f\circ g)' = f'(g)\cdot g'$ — backbone of backprop
§5 Implicit | $dy/dx = -F_x/F_y$; linear approximation
§6 Second Derivative | Concavity, inflection, $f''$ at critical points
§7 Extrema / MVT | First/second derivative tests; Lipschitz bounds
§8 Activations | $\sigma'=\sigma(1-\sigma)$, ReLU', GELU'; vanishing gradients
§9 Numerical Diff | Forward $O(h)$, centered $O(h^2)$, optimal $h\approx 10^{-5.3}$
§10 Proofs | Product rule, chain rule, differentiable → continuous
§11 Adam | First/second moment estimates; adaptive learning rates
§12 Backprop | Full 2-layer network chain rule derivation and verification
§13 Softmax+CE | $\partial\mathcal{L}/\partial z_j = p_j - y_j$
§14 Vanishing Grad | Exponential decay through sigmoid layers; ReLU advantage
§15 Stability | Numerically stable sigmoid, log-sigmoid, softmax

Next: Integration →