Theory Notebook — Math for LLMs

Integration

Calculus Fundamentals / Integration

Run notebook
Theory Notebook

Theory Notebook

Converted from theory.ipynb for web reading.

Integration — Theory Notebook

"To integrate is to make whole — to recover from the rate of change the quantity that changed."

Interactive exploration of integration: Riemann sums, the Fundamental Theorem, techniques, numerical methods, and connections to probability and machine learning.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Theme setup: use seaborn if it is installed, otherwise fall back to
# matplotlib's bundled port of the seaborn whitegrid style.
try:
    import seaborn as sns
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False
else:
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True

# Notebook-wide figure defaults: size, fonts, legend, spines, export quality.
mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)  # deterministic randomness for reproducible figures
print("Plot setup complete.")

Code cell 3

import math
import numpy as np
import numpy.linalg as la
from scipy import integrate, special, stats
from math import factorial
import matplotlib.patches as patches

# Shared hex palette so every figure in the chapter uses the same colors.
COLORS = dict(
    primary="#0077BB",
    secondary="#EE7733",
    tertiary="#009988",
    error="#CC3311",
    neutral="#555555",
    highlight="#EE3377",
)
HAS_MPL = True  # flag consulted by the plotting cells below
np.set_printoptions(precision=8, suppress=True)  # compact array output
np.random.seed(42)  # reproducible sampling across cells

def header(title):
    """Print *title* framed above and below by '=' rules of matching length."""
    rule = "=" * len(title)
    print("\n" + rule)
    print(title)
    print(rule)

def check_true(name, cond):
    """Print a PASS/FAIL line for a boolean condition; return it as bool."""
    result = bool(cond)
    status = "PASS" if result else "FAIL"
    print(f"{status} - {name}")
    return result

def check_close(name, got, expected, tol=1e-8):
    """Print PASS/FAIL for numeric closeness (tol used as both atol and rtol)."""
    is_close = np.allclose(got, expected, atol=tol, rtol=tol)
    status = "PASS" if is_close else "FAIL"
    print(f"{status} - {name}: got {got}, expected {expected}")
    return is_close



def centered_diff(f, x, h=1e-6):
    """Central-difference estimate of f'(x); O(h**2) truncation error."""
    return (f(x + h) - f(x - h)) / (2 * h)

def forward_diff(f, x, h=1e-6):
    """Forward-difference estimate of f'(x); O(h) truncation error."""
    return (f(x + h) - f(x)) / h

def backward_diff(f, x, h=1e-6):
    """Backward-difference estimate of f'(x); O(h) truncation error."""
    return (f(x) - f(x - h)) / h



def grad_check(f, x, analytic_grad, h=1e-6):
    """Relative error between *analytic_grad* and a central-difference
    gradient of scalar function *f* at point *x* (any array shape)."""
    x = np.asarray(x, dtype=float)
    analytic_grad = np.asarray(analytic_grad, dtype=float)
    numeric = np.zeros_like(x, dtype=float)
    # Bump one coordinate at a time and take a central difference.
    for idx in np.ndindex(x.shape):
        bumped_up = x.copy()
        bumped_dn = x.copy()
        bumped_up[idx] += h
        bumped_dn[idx] -= h
        numeric[idx] = (f(bumped_up) - f(bumped_dn)) / (2 * h)
    # Symmetric relative error; epsilon keeps the ratio finite at zero gradients.
    scale = la.norm(analytic_grad) + la.norm(numeric) + 1e-12
    return la.norm(analytic_grad - numeric) / scale



def check(name, got, expected, tol=1e-8):
    """Shorthand alias for check_close used throughout the chapter cells."""
    return check_close(name, got, expected, tol=tol)

print("Chapter helper setup complete.")

1. Intuition — Integration as Accumulation

The integral $\int_a^b f(x)\,dx$ answers: how much total change accumulates when the rate is $f(x)$? It is simultaneously:

  • Area under the curve (geometry)
  • Accumulated change given a rate (physics)
  • Expected value of f(X)f(X) for XUniform[a,b]X \sim \text{Uniform}[a,b] (probability)
  • Infinite sum of infinitesimal slices (limiting process)

Code cell 5

# === 1.1 Integration as Area — Visual Demo ===

import numpy as np

def f(x):
    """Shifted sine: strictly positive on [0, π], so area = integral."""
    return np.sin(x) + 1.5

a, b = 0, np.pi
true_area, _ = integrate.quad(f, a, b)
print(f'True area under sin(x)+1.5 on [0,π]: {true_area:.6f}')
print(f'Expected: π*1.5 + 2 = {np.pi*1.5 + 2:.6f}')  # ∫₀^π sin = 2; the +1.5 offset adds 1.5π

if HAS_MPL:
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    x_fine = np.linspace(a, b, 400)
    for ax, n in zip(axes, [4, 8, 32]):
        edges = np.linspace(a, b, n+1)
        width = (b-a)/n
        # Left-endpoint Riemann sum over n equal-width panels.
        riemann = sum(f(edges[i]) * width for i in range(n))
        ax.fill_between(x_fine, f(x_fine), alpha=0.3, color='steelblue', label='True area')
        ax.plot(x_fine, f(x_fine), 'k-', lw=2)
        for i in range(n):
            bar = patches.Rectangle((edges[i], 0), width, f(edges[i]),
                                    linewidth=0.5, edgecolor='navy', facecolor='orange', alpha=0.5)
            ax.add_patch(bar)
        ax.set_title(f'n={n}: R_n = {riemann:.4f}')
        ax.set_xlabel('x'); ax.set_ylabel('f(x)')
    plt.suptitle('Left Riemann Sums Converging to True Area', fontsize=14)
    plt.tight_layout(); plt.show()
    print(f'True value: {true_area:.6f}')

2. Riemann Sums — The Formal Definition

For a partition $P = \{a = x_0 < x_1 < \cdots < x_n = b\}$ and sample points $x_i^* \in [x_{i-1}, x_i]$:

$$\int_a^b f(x)\,dx = \lim_{\|P\|\to 0} \sum_{i=1}^n f(x_i^*)\,(x_i - x_{i-1})$$

Left sums: $x_i^* = x_{i-1}$; Right sums: $x_i^* = x_i$; Midpoint sums: $x_i^* = (x_{i-1}+x_i)/2$.

Code cell 7

# === 2.1 Convergence Rates of Riemann Sums ===

import numpy as np
from scipy import special  # erf gives the closed form; was missing in-cell

def f(x):
    """Gaussian bump: no elementary antiderivative, so quadrature matters."""
    return np.exp(-x**2)

a, b = 0.0, 2.0
true_val = np.sqrt(np.pi)/2 * special.erf(2)  # exact value of ∫₀² e^(-x²) dx

ns = [4, 8, 16, 32, 64, 128, 256, 512]
errors_left, errors_mid, errors_right = [], [], []

for n in ns:
    h = (b-a)/n
    xs = np.linspace(a, b, n+1)
    left  = h * np.sum(f(xs[:-1]))              # left endpoints
    right = h * np.sum(f(xs[1:]))               # right endpoints
    mid   = h * np.sum(f((xs[:-1] + xs[1:])/2)) # panel midpoints
    errors_left.append(abs(left - true_val))
    errors_right.append(abs(right - true_val))
    errors_mid.append(abs(mid - true_val))

print(f'True ∫₀² e^(-x²) dx = {true_val:.8f}')
# Double quotes outside: nesting the same quote type inside an f-string is a
# SyntaxError before Python 3.12 (PEP 701).
print(f"{'n':>6} | {'Left err':>12} | {'Mid err':>12} | {'Right err':>12}")
for i, n in enumerate(ns):
    print(f'{n:6d} | {errors_left[i]:12.2e} | {errors_mid[i]:12.2e} | {errors_right[i]:12.2e}')

# Fit log(error) vs log(n): slope ≈ -1 for left/right, ≈ -2 for midpoint.
log_ns = np.log(ns)
slope_left = np.polyfit(log_ns, np.log(errors_left), 1)[0]
slope_mid  = np.polyfit(log_ns, np.log(errors_mid),  1)[0]
print(f'\nConvergence order — Left: {slope_left:.2f}, Midpoint: {slope_mid:.2f}')
print('(Expected: Left/Right ~ O(h^1) = -1, Midpoint ~ O(h^2) = -2)')

3. Fundamental Theorem of Calculus

Part 1 (Differentiation of accumulation functions):

$$\frac{d}{dx}\int_a^x f(t)\,dt = f(x)$$

Part 2 (Evaluation via antiderivatives):

$$\int_a^b f(x)\,dx = F(b) - F(a) \quad \text{where } F' = f$$

The FTC bridges differentiation and integration — they are inverse operations.

Code cell 9

# === 3.1 FTC Part 1 — Accumulation Function ===

import numpy as np

# F(x) = ∫₀ˣ cos(t) dt should equal sin(x), and its derivative cos(x).

grid = np.linspace(0, 2*np.pi, 200)

# One quadrature per grid point builds the accumulation function numerically.
F_num   = np.array([integrate.quad(np.cos, 0, upper)[0] for upper in grid])
F_exact = np.sin(grid)

check('F(x) = ∫₀ˣ cos(t)dt = sin(x)', F_num, F_exact, tol=1e-8)

# FTC Part 1: differentiating the accumulation function recovers cos(x).
spacing = grid[1] - grid[0]
dF_num = np.gradient(F_num, spacing)

interior = slice(5, -5)  # np.gradient loses accuracy at the endpoints
check('d/dx[∫₀ˣ cos] ≈ cos(x)', dF_num[interior], np.cos(grid)[interior], tol=1e-4)

print('\n=== FTC Part 2: ∫₀^π sin(x) dx = 2 ===')
result, _ = integrate.quad(np.sin, 0, np.pi)
check('∫₀^π sin(x) dx = 2', result, 2.0)

print('\nKey antiderivatives verified:')
cases = [
    ('∫x²dx from 0 to 1 = 1/3', lambda x: x**2, 0, 1, 1/3),
    ('∫eˣdx from 0 to 1 = e-1',  np.exp,         0, 1, np.e-1),
    ('∫1/x dx from 1 to e = 1',  lambda x: 1/x, 1, np.e, 1.0),
]
for label, fn, lo, hi, expected in cases:
    value, _ = integrate.quad(fn, lo, hi)
    check(label, value, expected)

Code cell 10

# === 3.2 Leibniz Rule — Differentiating Under the Integral Sign ===

import numpy as np

# F(a) = ∫₀^a e^{-a t²} dt: the parameter a appears both in the upper limit
# and inside the integrand, so the full Leibniz rule applies:
#   dF/da = e^{-a·a²}·1 + ∫₀^a ∂/∂a[e^{-a t²}] dt
#         = e^{-a³}     + ∫₀^a (-t²) e^{-a t²} dt

def F_leibniz(a_val):
    """F(a) evaluated by adaptive quadrature."""
    return integrate.quad(lambda t: np.exp(-a_val * t**2), 0, a_val)[0]

def dF_leibniz_analytic(a_val):
    """dF/da from the Leibniz rule: boundary term plus ∫ of ∂f/∂a."""
    boundary = np.exp(-a_val**3)
    interior = integrate.quad(
        lambda t: -t**2 * np.exp(-a_val * t**2), 0, a_val)[0]
    return boundary + interior

def dF_numerical(a_val, h=1e-6):
    """Central-difference check of dF/da."""
    return (F_leibniz(a_val+h) - F_leibniz(a_val-h)) / (2*h)

print('Leibniz Rule Verification:')
for a_val in [0.5, 1.0, 1.5, 2.0]:
    analytic = dF_leibniz_analytic(a_val)
    numerical = dF_numerical(a_val)
    ok = abs(analytic - numerical) < 1e-5
    print(f"  {'PASS' if ok else 'FAIL'} a={a_val}: analytic={analytic:.6f}, numerical={numerical:.6f}")

print('\nLeibniz rule: d/da ∫_{g(a)}^{h(a)} f(x,a)dx = f(h(a),a)h\'(a) - f(g(a),a)g\'(a) + ∫∂f/∂a dx')

4. u-Substitution

The reverse chain rule: if the integrand has the form f(g(x))g(x)f(g(x))\,g'(x), set u=g(x)u = g(x):

f(g(x))g(x)dx=f(u)du\int f(g(x))\,g'(x)\,dx = \int f(u)\,du

For definite integrals, transform the limits: x=au=g(a)x=a \Rightarrow u=g(a), x=bu=g(b)x=b \Rightarrow u=g(b).

Code cell 12

# === 4.1 u-Substitution Examples ===

import numpy as np

print('=== u-Substitution Verification ===')

def _ddx(fn, x):
    """Central difference with the same 1e-7 step used by the original checks."""
    return (fn(x + 1e-7) - fn(x - 1e-7)) / 2e-7

# (a) ∫ 2x·cos(x²) dx: set u = x², du = 2x dx  =>  sin(x²) + C
def f_a(x): return 2*x * np.cos(x**2)
def antideriv_a(x): return np.sin(x**2)
x_test = 1.3
check('d/dx[sin(x²)] = 2x·cos(x²)', _ddx(antideriv_a, x_test), f_a(x_test))

# (b) ∫₀¹ x·e^(x²) dx — definite integral, limits transform with u = x²:
#     (1/2)∫₀¹ e^u du = (e-1)/2
val_b, _ = integrate.quad(lambda x: x*np.exp(x**2), 0, 1)
check('∫₀¹ x·e^(x²) dx = (e-1)/2', val_b, (np.e - 1)/2)

# (c) ∫ sin³(x) dx — split off one sine and substitute u = cos(x):
#     ∫(1-cos²x)·sin(x) dx = -u + u³/3 = -cos(x) + cos³(x)/3
def antideriv_c(x): return -np.cos(x) + np.cos(x)**3/3
def f_c(x): return np.sin(x)**3
check('d/dx[-cos(x)+cos³(x)/3] = sin³(x)', _ddx(antideriv_c, x_test), f_c(x_test))

# (d) Normalizing-flow change-of-variables, 1D case:
#     if z = g(x) then p_X(x) = p_Z(g(x))·|g'(x)|
#     (the full Jacobian determinant appears in the multivariable version)
print('\n--- Normalizing Flow change-of-variables ---')
def g(x): return x**3  # strictly monotone, hence invertible on all of R
def g_inv(z): return np.cbrt(z)
def g_prime(x): return 3*x**2

from scipy.stats import norm
x_val = 1.5
z_val = g(x_val)
p_Z = norm.pdf(z_val)
p_X = p_Z * abs(g_prime(x_val))  # density transforms by |dz/dx|
print(f'x={x_val}, z=g(x)={z_val:.4f}')
print(f'p_Z(z) = {p_Z:.6f}, |g\'(x)| = {g_prime(x_val):.4f}')
print(f'p_X(x) = p_Z(g(x))|g\'(x)| = {p_X:.6f}')

5. Integration by Parts

The reverse product rule: udv=uvvdu\int u\,dv = uv - \int v\,du

LIATE order for choosing uu: Logarithms, Inverse trig, Algebraic, Trig, Exponential (choose uu from leftmost category present).

ML connection: The REINFORCE gradient estimator uses integration by parts (the log-derivative trick) to compute gradients of expected values.

Code cell 14

# === 5.1 Integration by Parts Examples ===

import numpy as np

print('=== Integration by Parts Verification ===')

def _ddx(fn, x):
    """Central difference matching the notebook's 1e-7 step size."""
    return (fn(x + 1e-7) - fn(x - 1e-7)) / 2e-7

# (a) ∫ x·eˣ dx: u = x, dv = eˣ dx  =>  xeˣ - ∫eˣ dx = eˣ(x-1) + C
def antideriv_a(x): return np.exp(x) * (x - 1)
def f_a(x): return x * np.exp(x)
x_test = 1.5
check('d/dx[eˣ(x-1)] = x·eˣ', _ddx(antideriv_a, x_test), f_a(x_test))

# (b) ∫ ln(x) dx: u = ln(x), dv = dx  =>  x·ln(x) - x + C
def antideriv_b(x): return x*np.log(x) - x
def f_b(x): return np.log(x)
x_test = 2.0
check('d/dx[x·ln(x)-x] = ln(x)', _ddx(antideriv_b, x_test), f_b(x_test))

# (c) ∫ eˣ·sin(x) dx — integrate by parts twice and solve for I:
#     I = eˣsin(x) - eˣcos(x) - I  =>  I = eˣ(sin(x)-cos(x))/2
def antideriv_c(x): return np.exp(x)*(np.sin(x)-np.cos(x))/2
def f_c(x): return np.exp(x)*np.sin(x)
check('d/dx[eˣ(sin-cos)/2] = eˣ·sin(x)', _ddx(antideriv_c, x_test), f_c(x_test))

# (d) REINFORCE / log-derivative trick:
#     ∇_θ E_pθ[f(x)] = E_pθ[f(x)·∇_θ log pθ(x)]
print('\n--- REINFORCE gradient estimator demo ---')
# Bernoulli(θ) with reward f(x) = x, so d/dθ E[X] = d/dθ θ = 1 exactly.
theta = 0.4
n_samples = 100000
np.random.seed(42)
x_samples = (np.random.rand(n_samples) < theta).astype(float)
# ∇_θ log pθ(x) is 1/θ when x=1 and -1/(1-θ) when x=0.
log_grad = np.where(x_samples == 1, 1/theta, -1/(1-theta))
reinforce_grad = np.mean(x_samples * log_grad)
print(f'REINFORCE gradient estimate: {reinforce_grad:.4f} (true: 1.0)')
check('REINFORCE ≈ 1.0', reinforce_grad, 1.0, tol=0.02)

6. Partial Fractions

Rational functions P(x)/Q(x)P(x)/Q(x) can be split into simpler fractions:

  • Distinct linear factors: Axa+Bxb\frac{A}{x-a} + \frac{B}{x-b}
  • Repeated linear factors: Axa+B(xa)2\frac{A}{x-a} + \frac{B}{(x-a)^2}
  • Irreducible quadratics: Ax+Bx2+bx+c\frac{Ax+B}{x^2+bx+c}

Code cell 16

# === 6.1 Partial Fractions — Numerical Verification ===

import numpy as np
from scipy import integrate

# Decompose 1/((x-1)(x+2)) = A/(x-1) + B/(x+2) with A = 1/3, B = -1/3,
# so one antiderivative is (1/3)[ln|x-1| - ln|x+2|].

def f_original(x): return 1/((x-1)*(x+2))
def antideriv(x): return (np.log(abs(x-1)) - np.log(abs(x+2)))/3

# Check F' = f at a sample point via a central difference.
x_test = 2.5
d_num = (antideriv(x_test+1e-7) - antideriv(x_test-1e-7)) / 2e-7
ok = abs(d_num - f_original(x_test)) < 1e-5
print(f"{'PASS' if ok else 'FAIL'} — antiderivative of 1/((x-1)(x+2)) verified at x=2.5")

# Definite integral: closed form vs adaptive quadrature.
a, b = 2.0, 4.0
analytical = antideriv(b) - antideriv(a)
numerical, _ = integrate.quad(f_original, a, b)
ok2 = abs(analytical - numerical) < 1e-8
print(f"{'PASS' if ok2 else 'FAIL'} — ∫₂⁴ 1/((x-1)(x+2)) dx: analytic={analytical:.8f}, numerical={numerical:.8f}")

# Irreducible quadratic factor: ∫ dx/(x²+1) = arctan(x) + C.
val, _ = integrate.quad(lambda x: 1/(x**2+1), 0, 1)
ok3 = abs(val - np.pi/4) < 1e-8
print(f"{'PASS' if ok3 else 'FAIL'} — ∫₀¹ 1/(x²+1) dx = π/4 = {np.pi/4:.8f}, got {val:.8f}")

7. Improper Integrals

Type I (infinite limits): $\int_a^\infty f(x)\,dx = \lim_{R\to\infty}\int_a^R f(x)\,dx$

Type II (unbounded integrand): $\int_a^b f(x)\,dx = \lim_{\epsilon\to 0^+}\int_{a+\epsilon}^b f(x)\,dx$

p-integral test: $\int_1^\infty x^{-p}\,dx$ converges $\iff p > 1$.

Gaussian integral: $\int_{-\infty}^\infty e^{-x^2}\,dx = \sqrt{\pi}$ — central to ML statistics.

Code cell 18

# === 7.1 Improper Integrals - Convergence ===

import numpy as np
from scipy import integrate, special
from scipy.stats import norm

print('=== p-Integral Convergence Test ===')
# ∫₁^∞ x^{-p} dx converges exactly when p > 1, with value 1/(p-1).
for p in [0.5, 0.9, 1.0, 1.1, 2.0, 3.0]:
    if p > 1:
        val, _ = integrate.quad(lambda x: x**(-p), 1, np.inf, limit=200)
        analytic = 1.0 / (p - 1.0)
        ok = abs(val - analytic) < 1e-6
        print(f"  {'PASS' if ok else 'FAIL'} p={p}: value={val:.4f}, analytic={analytic:.4f}")
    else:
        print(f"  PASS p={p}: diverges by the p-integral test")

print('\n=== Gaussian Integral ===')
# Truncating at ±50 is safe: the tail mass beyond is far below double precision.
gaussian, _ = integrate.quad(lambda x: np.exp(-x**2), -50, 50, limit=200)
print(f'integral exp(-x^2) dx ~= {gaussian:.8f}')
print(f'sqrt(pi)                 = {np.sqrt(np.pi):.8f}')
check('Gaussian integral = sqrt(pi)', gaussian, np.sqrt(np.pi), tol=1e-6)

print('\n=== Standard Normal Normalization ===')
norm_integral, _ = integrate.quad(norm.pdf, -50, 50, limit=200)
check('Integral of N(0,1) pdf = 1', norm_integral, 1.0, tol=1e-8)
print('Takeaway: improper integrals need analytic convergence checks before numerical quadrature.')

8. Numerical Integration

When analytic antiderivatives don't exist, we approximate:

| Method | Formula | Error |
| --- | --- | --- |
| Trapezoid | $\frac{h}{2}(f_0 + 2f_1 + \cdots + 2f_{n-1} + f_n)$ | $O(h^2)$ |
| Simpson's | $\frac{h}{3}(f_0 + 4f_1 + 2f_2 + \cdots + 4f_{n-1} + f_n)$ | $O(h^4)$ |
| Monte Carlo | $\frac{b-a}{n}\sum f(x_i)$, $x_i \sim U[a,b]$ | $O(n^{-1/2})$ |

Monte Carlo is dimension-independent — unlike grid methods which scale as $O(n^{-2/d})$.

Code cell 20

# === 8.1 Numerical Integration — Accuracy Comparison ===

import numpy as np
from scipy import integrate, special

# Integrate f(x) = e^(-x²) on [0, 2]; closed form via erf: (√π/2)·erf(2).
true_val = np.sqrt(np.pi)/2 * special.erf(2)
a, b = 0.0, 2.0

def f(x):
    """Gaussian integrand — no elementary antiderivative."""
    return np.exp(-x**2)

def trapezoid(f, a, b, n):
    """Composite trapezoid rule with n uniform panels; error O(h²)."""
    xs = np.linspace(a, b, n+1)
    h = (b-a)/n
    return h * (f(xs[0])/2 + np.sum(f(xs[1:-1])) + f(xs[-1])/2)

def simpsons(f, a, b, n):
    """Composite Simpson's rule; n is bumped to even if odd. Error O(h⁴)."""
    if n % 2 != 0: n += 1
    xs = np.linspace(a, b, n+1)
    h = (b-a)/n
    # Simpson weight pattern 1, 4, 2, 4, ..., 2, 4, 1.
    coeffs = np.ones(n+1)
    coeffs[1:-1:2] = 4
    coeffs[2:-2:2] = 2
    return (h/3) * np.dot(coeffs, f(xs))

def monte_carlo(f, a, b, n, seed=42):
    """Plain Monte Carlo with n uniform samples; error O(n^-1/2)."""
    np.random.seed(seed)
    xs = np.random.uniform(a, b, n)
    return (b-a) * np.mean(f(xs))

print(f'True value: {true_val:.10f}\n')
# Double quotes outside: nesting the same quote type inside an f-string is a
# SyntaxError before Python 3.12 (PEP 701).
print(f"{'n':>8} | {'Trap error':>12} | {'Simp error':>12} | {'MC error':>12}")
for n in [10, 50, 100, 500, 1000]:
    t_err = abs(trapezoid(f, a, b, n) - true_val)
    s_err = abs(simpsons(f, a, b, n) - true_val)
    m_err = abs(monte_carlo(f, a, b, n) - true_val)
    print(f'{n:8d} | {t_err:12.2e} | {s_err:12.2e} | {m_err:12.2e}')

print('\nSimpson\'s converges much faster (O(h⁴) vs O(h²) for trapezoid)')
print('Monte Carlo: O(1/√n) regardless of dimension — see § on high-dim below')

Code cell 21

# === 8.2 Monte Carlo — Dimension Independence ===

import numpy as np
from scipy.special import erf

# Integrate f(x) = exp(-||x||²) over the box [-3,3]^d.
# The integrand factorizes across coordinates, so the true value is
# the 1-D integral raised to the d-th power: (√π·erf(3))^d.

def mc_integral_d(d, n=100000, seed=42):
    """Monte Carlo estimate of ∫ exp(-||x||²) over [-3,3]^d using n samples."""
    np.random.seed(seed)
    xs = np.random.uniform(-3, 3, (n, d))
    fvals = np.exp(-np.sum(xs**2, axis=1))
    vol = 6.0**d  # volume of the integration box
    return vol * np.mean(fvals)

def true_val_d(d):
    """Exact value: (√π·erf(3))^d by separability of the integrand."""
    return (np.sqrt(np.pi) * erf(3))**d

# Double quotes outside: nesting the same quote type inside an f-string is a
# SyntaxError before Python 3.12 (PEP 701).
print(f"{'d':>3} | {'True value':>14} | {'MC estimate':>14} | {'Rel error':>10}")
for d in [1, 2, 5, 10, 20]:
    true = true_val_d(d)
    mc = mc_integral_d(d)
    rel_err = abs(mc - true) / abs(true)
    print(f'{d:3d} | {true:14.4e} | {mc:14.4e} | {rel_err:10.4f}')

print('\nKey insight: MC error stays ~O(1/√n) regardless of dimension.')
print('Grid methods (trapezoid): need n^d points => exponential cost.')
print('This is why mini-batch SGD works: it is Monte Carlo for E[∇L].')

9. Integration in Probability

Continuous probability lives in the integration framework:

P(aXb)=abp(x)dxE[f(X)]=f(x)p(x)dxP(a \le X \le b) = \int_a^b p(x)\,dx \qquad \mathbb{E}[f(X)] = \int f(x)\,p(x)\,dx DKL(pq)=p(x)lnp(x)q(x)dx0H(p)=p(x)lnp(x)dxD_{KL}(p\|q) = \int p(x)\ln\frac{p(x)}{q(x)}\,dx \ge 0 \qquad H(p) = -\int p(x)\ln p(x)\,dx

Code cell 23

# === 9.1 PDF/CDF and Expected Values ===

import numpy as np
from scipy import integrate, stats

print('=== Gaussian PDF/CDF Verification ===')

# A Gaussian with nonzero mean and non-unit spread (not the standard normal).
mu, sigma = 2.0, 1.5
dist = stats.norm(mu, sigma)

# Normalization: the density must integrate to 1.
total, _ = integrate.quad(dist.pdf, -100, 100)
ok1 = abs(total - 1.0) < 1e-8
print(f"{'PASS' if ok1 else 'FAIL'} — PDF integrates to 1: {total:.10f}")

# First moment: E[X] = μ.
E_X, _ = integrate.quad(lambda x: x * dist.pdf(x), -100, 100)
ok2 = abs(E_X - mu) < 1e-6
print(f"{'PASS' if ok2 else 'FAIL'} — E[X] = μ = {mu}: got {E_X:.8f}")

# Second moment: E[X²] = σ² + μ².
E_X2, _ = integrate.quad(lambda x: x**2 * dist.pdf(x), -100, 100)
expected_E_X2 = sigma**2 + mu**2
ok3 = abs(E_X2 - expected_E_X2) < 1e-6
print(f"{'PASS' if ok3 else 'FAIL'} — E[X²] = σ²+μ² = {expected_E_X2}: got {E_X2:.8f}")

# FTC link: the CDF is the accumulation function of the PDF.
x0 = 3.0
cdf_from_integral, _ = integrate.quad(dist.pdf, -np.inf, x0)
cdf_direct = dist.cdf(x0)
ok4 = abs(cdf_from_integral - cdf_direct) < 1e-8
print(f"{'PASS' if ok4 else 'FAIL'} — CDF(3) = ∫₋∞³ p(x)dx: integral={cdf_from_integral:.8f}, scipy={cdf_direct:.8f}")

Code cell 24

# === 9.2 KL Divergence — Gibbs Inequality Verification ===

import numpy as np
from scipy import integrate, stats

def kl_div_gaussian(mu1, sigma1, mu2, sigma2):
    """KL(N(mu1,sigma1²) || N(mu2,sigma2²)) — analytic formula."""
    quadratic_term = (sigma1**2 + (mu1 - mu2)**2) / (2 * sigma2**2)
    return np.log(sigma2 / sigma1) + quadratic_term - 0.5

def kl_div_numerical(mu1, sigma1, mu2, sigma2):
    """KL via quadrature over ±10 standard deviations of p."""
    p = stats.norm(mu1, sigma1)
    q = stats.norm(mu2, sigma2)
    def integrand(x):
        px = p.pdf(x)
        qx = q.pdf(x)
        # Guard against log(0) where p underflows in the far tail.
        return px * np.log(px/qx) if px > 1e-300 else 0.0
    val, _ = integrate.quad(integrand, mu1 - 10*sigma1, mu1 + 10*sigma1)
    return val

print('KL Divergence: Analytic vs Numerical')
for mu1, s1, mu2, s2 in [
    (0.0, 1.0, 0.0, 1.0),   # identical distributions => KL = 0
    (1.0, 1.0, 0.0, 1.0),   # mean shift only => KL = 0.5
    (0.0, 1.0, 0.0, 2.0),   # different variance
    (2.0, 0.5, 0.0, 1.0),   # both shifted
]:
    analytic = kl_div_gaussian(mu1, s1, mu2, s2)
    numerical = kl_div_numerical(mu1, s1, mu2, s2)
    ok = abs(analytic - numerical) < 1e-5 and analytic >= -1e-10
    print(f"  {'PASS' if ok else 'FAIL'} KL(N({mu1},{s1}²)||N({mu2},{s2}²)) = {analytic:.6f} (numerical={numerical:.6f})")

print('\nAll KL values ≥ 0 — Gibbs inequality verified.')

# Special case used as the VAE penalty: KL(N(mu,1) || N(0,1)) = mu²/2.
print('\nVAE KL penalty — KL(N(μ,1)||N(0,1)) = μ²/2:')
for mu in [0.0, 1.0, 2.0, 3.0]:
    analytic = mu**2 / 2
    numerical = kl_div_gaussian(mu, 1.0, 0.0, 1.0)
    ok = abs(analytic - numerical) < 1e-6
    print(f"  {'PASS' if ok else 'FAIL'} μ={mu}: μ²/2={analytic:.4f}, formula={numerical:.6f}")

Code cell 25

# === 9.3 Entropy and Cross-Entropy ===

import numpy as np
from scipy import integrate, stats

print('=== Differential Entropy of Gaussian ===')
# Analytic result: H[N(mu,sigma²)] = (1/2) ln(2πe·sigma²).
sigma = 2.0
dist = stats.norm(0, sigma)

def entropy_integrand(x):
    px = dist.pdf(x)
    # Skip points where the density underflows to avoid log(0).
    return -px * np.log(px) if px > 1e-300 else 0.0

H_numerical, _ = integrate.quad(entropy_integrand, -20, 20)
H_analytic = 0.5 * np.log(2 * np.pi * np.e * sigma**2)
ok = abs(H_numerical - H_analytic) < 1e-6
print(f"{'PASS' if ok else 'FAIL'} — H[N(0,σ²)]: numerical={H_numerical:.8f}, analytic={H_analytic:.8f}")

print('\n=== Cross-Entropy and NLL ===')
# Identity to verify: H(p, q) = -E_p[log q] = H(p) + KL(p||q).
mu1, s1 = 0.0, 1.0  # true p
mu2, s2 = 1.0, 1.5  # model q
p = stats.norm(mu1, s1)
q = stats.norm(mu2, s2)

def ce_integrand(x):
    px = p.pdf(x)
    qx = q.pdf(x)
    return -px * np.log(qx) if px > 1e-300 and qx > 1e-300 else 0.0

def kl_int(x):
    px, qx = p.pdf(x), q.pdf(x)
    return px * np.log(px/qx) if px > 1e-300 and qx > 1e-300 else 0.0

H_ce, _ = integrate.quad(ce_integrand, -15, 15)
H_p = 0.5 * np.log(2 * np.pi * np.e * s1**2)
KL, _ = integrate.quad(kl_int, -15, 15)
H_ce_formula = H_p + KL
ok2 = abs(H_ce - H_ce_formula) < 1e-5
print(f"{'PASS' if ok2 else 'FAIL'} — Cross-entropy = H(p) + KL: CE={H_ce:.6f}, H_p+KL={H_ce_formula:.6f}")
print(f'  H(p)={H_p:.6f}, KL(p||q)={KL:.6f}')
print('Minimizing CE(p,q) over q = minimizing KL(p||q) = MLE!')

10. Integration in Deep Learning — Neural ODEs

A Neural ODE (Chen et al. 2018) defines a continuous-depth model:

h(T)=h(0)+0Tfθ(h(t),t)dt\mathbf{h}(T) = \mathbf{h}(0) + \int_0^T f_\theta(\mathbf{h}(t), t)\,dt

where fθf_\theta is a neural network. The forward pass is ODE integration; gradients are computed via the adjoint method (integration backwards in time) using O(1)O(1) memory instead of O(T)O(T).

Code cell 27

# === 10.1 Neural ODE — Simple 1D Example ===

import numpy as np
from scipy.integrate import solve_ivp

# Continuous-depth model dh/dt = -a·h with known solution h(T) = h(0)·e^{-aT}.
# A ResNet is the explicit-Euler discretization of this flow.

a = 1.5  # learned parameter
h0 = 2.0
T = 1.0

def ode_func(t, h):
    """Right-hand side f(h, t) of the 1-D 'neural' ODE."""
    return [-a * h[0]]

# Forward pass of a Neural ODE = solving the ODE.
sol = solve_ivp(ode_func, [0, T], [h0], dense_output=True, rtol=1e-8)
h_T_ode = sol.y[0, -1]
h_T_true = h0 * np.exp(-a * T)

ok = abs(h_T_ode - h_T_true) < 1e-6
print(f"{'PASS' if ok else 'FAIL'} — ODE solution: h(T)={h_T_ode:.8f}, true={h_T_true:.8f}")

# Explicit Euler with n_steps "residual blocks", compared to the solver.
print('\nResNet (Euler) vs Neural ODE:')
for n_steps in [1, 2, 4, 8, 16, 32]:
    dt = T / n_steps
    h = h0
    for _ in range(n_steps):
        h += dt * (-a * h)  # one Euler step == one residual connection
    err = abs(h - h_T_true)
    print(f'  n_steps={n_steps:3d}: h(T)={h:.8f}, error={err:.2e}')
print(f'  ODE solver:   h(T)={h_T_ode:.8f}, error={abs(h_T_ode-h_T_true):.2e}')
print('\nNeural ODE = ResNet limit as n_steps → ∞, depth → ∞')

Code cell 28

# === 10.2 Adjoint Method — Backprop Through ODE ===

import numpy as np
from scipy.integrate import solve_ivp

# Setup: dh/dt = -a·h with h(0) = h0 and terminal loss L = h(T)²/2.
# The ODE solves in closed form, so dL/da is available analytically; we
# verify it against central differences. (A real adjoint implementation
# would obtain the same gradient by integrating backwards in time.)

a_param = 1.5
h0_val = 2.0
T = 1.0

def h_solution(a, h0=h0_val, t_end=T):
    """Closed-form solution h(t_end) = h0·e^{-a·t_end}."""
    return h0 * np.exp(-a * t_end)

def loss(a):
    """Terminal loss L(a) = h(T)²/2."""
    return h_solution(a)**2 / 2

def grad_analytic(a):
    """dL/da = -T·h0²·e^{-2aT}, from differentiating L = (h0 e^{-aT})²/2."""
    return -T * h0_val**2 * np.exp(-2*a*T)

def grad_numerical(a, h=1e-5):
    """Central-difference estimate of dL/da."""
    return (loss(a+h) - loss(a-h)) / (2*h)

print('Adjoint Gradient Verification:')
for a_test in [0.5, 1.0, 1.5, 2.0]:
    ga = grad_analytic(a_test)
    gn = grad_numerical(a_test)
    ok = abs(ga - gn) < 1e-5
    print(f"  {'PASS' if ok else 'FAIL'} a={a_test}: analytic={ga:.8f}, numerical={gn:.8f}")

print('\nThe adjoint method computes this via backward ODE integration')
print('= O(1) memory (vs O(T) for storing intermediate states)')

11. Attention Mechanism as Continuous Integration

Scaled dot-product attention can be viewed as a discrete approximation to:

Attn(q,K,V)=K(x)V(x)p(xq)dx\text{Attn}(q, K, V) = \int K(x) V(x) \, p(x | q)\,dx

where p(xq)exp(qk(x)/d)p(x|q) \propto \exp(q \cdot k(x) / \sqrt{d}) is a softmax distribution and the integral is approximated by summing over the nn key-value pairs. This motivates linear attention (kernel approximations) and continuous attention (over sequences as functions).

Code cell 30

# === 11.1 Attention as Discrete Integration (Expected Value) ===

import numpy as np

np.random.seed(42)

# Softmax attention output = sum_j softmax(q·K/√d)_j · V_j — i.e. the
# expected value of V under the softmax distribution over positions.

d = 8   # embedding dim
n = 10  # sequence length

q = np.random.randn(d)         # query
K = np.random.randn(n, d)      # keys
V = np.random.randn(n, d)      # values

# Softmax over scaled dot-product scores (max-shift for numerical stability).
scores = K @ q / np.sqrt(d)    # (n,)
scores -= scores.max()
weights = np.exp(scores)
weights /= weights.sum()       # now a probability distribution over positions

attn_output = weights @ V       # (d,) — weighted sum of value vectors

# The same quantity written as an explicit expectation over positions.
E_V = sum(weights[j] * V[j] for j in range(n))
ok = np.allclose(attn_output, E_V)
print(f"{'PASS' if ok else 'FAIL'} — Attention = E_softmax[V]")
print(f'Attention weights (softmax probs): {weights}')
print(f'sum(weights) = {weights.sum():.8f} — verified probability distribution')

# How concentrated is the attention? Quantify with Shannon entropy.
H_attn = -np.sum(weights * np.log(weights + 1e-300))
print(f'\nAttention entropy: {H_attn:.4f} nats')
print(f'Max possible (uniform): {np.log(n):.4f} nats')
print(f'Min possible (one-hot): 0')
print('\nHigh-entropy attention = attending broadly (integration)')
print('Low-entropy attention = attending sharply (evaluation at a point)')

12. VAE ELBO — Integration in Generative Models

The Variational Autoencoder objective (ELBO) is:

LELBO=Eqϕ(zx)[lnpθ(xz)]reconstructionDKL(qϕ(zx)p(z))KL penalty\mathcal{L}_{\text{ELBO}} = \underbrace{\mathbb{E}_{q_\phi(z|x)}[\ln p_\theta(x|z)]}_{\text{reconstruction}} - \underbrace{D_{KL}(q_\phi(z|x)\|p(z))}_{\text{KL penalty}}

Both terms are integrals. The reconstruction term is estimated via Monte Carlo; the KL term is computed analytically for Gaussian qq and prior pp.

Code cell 32

# === 12.1 VAE ELBO — Decomposition ===

import numpy as np
from scipy import stats

np.random.seed(42)

# Toy VAE pieces: encoder q(z|x) = N(mu_phi, diag(sigma_phi²)),
# prior p(z) = N(0, I), decoder p(x|z) = N(mu_theta(z), I).

batch_size = 64
z_dim = 4
x_dim = 8

# Simulated encoder outputs for one mini-batch.
mu_phi    = 0.5 * np.random.randn(batch_size, z_dim)
log_sigma = -0.5 * np.ones((batch_size, z_dim))  # sigma=exp(-0.5)≈0.6
sigma_phi = np.exp(log_sigma)

# Reparameterization trick: z = mu + sigma ⊙ eps with eps ~ N(0, I),
# which makes the sample differentiable in (mu, sigma).
epsilon = np.random.randn(batch_size, z_dim)
z_samples = mu_phi + sigma_phi * epsilon

# Linear decoder for the demo: mu_theta(z) = W z.
W = np.random.randn(z_dim, x_dim) / np.sqrt(z_dim)
x_true = np.random.randn(batch_size, x_dim)
mu_theta = z_samples @ W

# Monte Carlo reconstruction term E_q[log p(x|z)], up to an additive constant.
recon_loss = -0.5 * np.mean(np.sum((x_true - mu_theta)**2, axis=1))

# Analytic KL to the standard-normal prior, summed over latent dims:
# KL = Σ_j [mu_j²/2 + sigma_j²/2 - log(sigma_j) - 1/2].
kl_per_dim = 0.5 * (mu_phi**2 + sigma_phi**2 - 2*log_sigma - 1)
kl_loss = np.mean(np.sum(kl_per_dim, axis=1))  # mean over the batch

elbo = recon_loss - kl_loss

print('=== VAE ELBO Components ===')
print(f'Reconstruction term: E[log p(x|z)] = {recon_loss:.4f}')
print(f'KL term: KL(q||p)                  = {kl_loss:.4f}')
print(f'ELBO = recon - KL                  = {elbo:.4f}')

print(f'\nKL ≥ 0: {kl_loss >= 0}')
print('KL close to 0 ↔ encoder close to prior (posterior collapse risk)')
print('KL large → encoder far from prior (effective latent usage)')
print('\nELBO = integral-based bound on log-likelihood log p(x)')

13. Score Matching and Diffusion Models

Diffusion models (DDPM, DDIM, score-based SDEs) train a neural network to estimate the score function xlnp(x)\nabla_x \ln p(x). The training objective is:

LSM=Exp[sθ(x)xlnp(x)2]\mathcal{L}_{\text{SM}} = \mathbb{E}_{x\sim p}\left[\|s_\theta(x) - \nabla_x \ln p(x)\|^2\right]

This expectation is an integral over the data distribution. The score is the gradient of the log-density — a key role of differentiation in integration.

Code cell 34

# === 13.1 Score Function of a Gaussian ===

import numpy as np
from scipy import stats, integrate

# For p(x) = N(mu, sigma²) the score is linear in x:
#   s(x) = d/dx log p(x) = -(x - mu)/sigma²

mu, sigma = 2.0, 1.5
dist = stats.norm(mu, sigma)

def score_analytic(x):
    """Closed-form score of N(mu, sigma²)."""
    return -(x - mu) / sigma**2

def score_numerical(x, h=1e-5):
    """Central difference of log p(x); the 1e-300 floor guards log(0)."""
    return (np.log(dist.pdf(x+h) + 1e-300) - np.log(dist.pdf(x-h) + 1e-300)) / (2*h)

print('Score function s(x) = d/dx log p(x) for N(2, 1.5²):')
for x in np.array([-1.0, 0.0, 1.0, 2.0, 3.0, 4.0]):
    sa = score_analytic(x)
    sn = score_numerical(x)
    ok = abs(sa - sn) < 1e-4
    print(f"  {'PASS' if ok else 'FAIL'} x={x:5.1f}: analytic={sa:8.4f}, numerical={sn:8.4f}")

# The score vanishes at the mode and points toward it everywhere else.
print(f'\nScore at mean x=mu: {score_analytic(mu):.8f} (should be 0)')
print('Score points TOWARD the mean — this is what diffusion uses to denoise')

# Optimal denoiser (Tweedie): E[x_clean | x_noisy] = x_noisy + sigma_noise²·score(x_noisy)
print('\nDiffusion: score function guides noisy sample back to clean data distribution')

Code cell 35

# === 13.2 Monte Carlo Estimation of Expected Loss ===

import numpy as np
from scipy import stats

# Training loss E_p[L(x, theta)] = ∫ L(x, theta) p(x) dx is an integral over
# the data distribution, approximated by Monte Carlo: (1/n) Σ_i L(x_i, theta).

np.random.seed(42)

# Regression toy: p(x) = N(0,1) and L(x, theta) = (x - theta)².
# E[L] = Var[X] + (E[X] - theta)² = 1 + theta², minimized at theta = 0.

theta = 0.5
true_loss = 1 + theta**2  # analytic expected loss

n_samples_list = [10, 100, 1000, 10000, 100000]
print(f'True expected loss: {true_loss:.6f}')
# Double quotes outside: nesting the same quote type inside an f-string is a
# SyntaxError before Python 3.12 (PEP 701).
print(f"{'n':>8} | {'MC estimate':>12} | {'Error':>10} | {'Std err':>10}")
for n in n_samples_list:
    x_samples = np.random.randn(n)
    losses = (x_samples - theta)**2
    mc_est = np.mean(losses)
    std_err = np.std(losses) / np.sqrt(n)  # CLT-based standard error
    err = abs(mc_est - true_loss)
    print(f'{n:8d} | {mc_est:12.6f} | {err:10.6f} | {std_err:10.6f}')

print('\nMC error ~ O(1/√n) — matches theoretical CLT bound')
print('This IS mini-batch SGD: each mini-batch is a MC estimate of E[∇L]')

14. Gaussian Quadrature — Optimal Node Selection

Gauss-Legendre quadrature selects $n$ nodes $x_1,\ldots,x_n$ and weights $w_1,\ldots,w_n$ to exactly integrate any polynomial of degree $\le 2n-1$:

$$\int_{-1}^1 f(x)\,dx \approx \sum_{i=1}^n w_i f(x_i)$$

The nodes are the roots of the Legendre polynomial $P_n(x)$. For smooth integrands the error shrinks extremely fast as $n$ grows (it involves the $2n$-th derivative of $f$) — far better than the trapezoid rule's $O(h^2)$.

Code cell 37

# === 14.1 Gaussian Quadrature vs Simpson's ===

import numpy as np
from numpy.polynomial.legendre import leggauss
from scipy import integrate, special

def gauss_legendre(f, a, b, n):
    """n-point Gauss-Legendre quadrature of f on [a, b].

    Exact for polynomials of degree <= 2n-1. Nodes/weights come from
    numpy's leggauss on the reference interval [-1, 1] and are mapped
    affinely to [a, b].
    """
    nodes, weights = leggauss(n)
    # Affine map [-1, 1] -> [a, b]; weights scale by the Jacobian (b-a)/2
    xs = 0.5*(b-a)*nodes + 0.5*(b+a)
    ws = 0.5*(b-a)*weights
    return np.dot(ws, f(xs))

def simpsons(f, a, b, n=100):
    """Composite Simpson's rule with n subintervals (n is forced even)."""
    if n % 2 != 0: n += 1
    xs = np.linspace(a, b, n+1)
    h = (b-a)/n
    # Simpson coefficient pattern 1, 4, 2, 4, ..., 2, 4, 1
    coeffs = np.ones(n+1); coeffs[1:-1:2]=4; coeffs[2:-2:2]=2
    return (h/3)*np.dot(coeffs, f(xs))

a, b = 0.0, 2.0
f = lambda x: np.exp(-x**2)
true_val = np.sqrt(np.pi)/2 * special.erf(2)

print(f'True ∫₀² e^(-x²) dx = {true_val:.12f}')
# Double-quoted f-string: same-quote nesting inside {...} is a
# SyntaxError before Python 3.12 (PEP 701).
print(f"{'Method':>20} | {'n':>4} | {'Estimate':>16} | {'Error':>10}")

for n in [2, 3, 5, 8, 10]:
    gl = gauss_legendre(f, a, b, n)
    err = abs(gl - true_val)
    print(f"{'Gauss-Legendre':>20} | {n:4d} | {gl:16.12f} | {err:10.2e}")

for n in [10, 50, 100, 500]:
    simp = simpsons(f, a, b, n)
    err = abs(simp - true_val)
    print(f"{'Simpsons':>20} | {n:4d} | {simp:16.12f} | {err:10.2e}")

print('\n5-point Gauss-Legendre beats 500-point Simpson\'s!')
print('Reason: Gauss nodes are OPTIMAL (not equally spaced)')

15. Trigonometric Substitution

For integrands containing square roots, trig substitutions eliminate them:

| Expression | Substitution | Identity used |
|---|---|---|
| $\sqrt{a^2 - x^2}$ | $x = a\sin\theta$ | $1-\sin^2\theta=\cos^2\theta$ |
| $\sqrt{a^2 + x^2}$ | $x = a\tan\theta$ | $1+\tan^2\theta=\sec^2\theta$ |
| $\sqrt{x^2 - a^2}$ | $x = a\sec\theta$ | $\sec^2\theta-1=\tan^2\theta$ |

Code cell 39

# === 15.1 Trig Substitution — Area of Semicircle ===

import numpy as np
from scipy import integrate

# ∫₀¹ √(1-x²) dx = π/4 (area of quarter circle)
# x = sin(θ), dx = cos(θ)dθ, √(1-x²) = cos(θ)
# ∫₀^(π/2) cos²(θ) dθ = π/4

quarter_circle = np.pi / 4

# Integral in the original x variable
val1 = integrate.quad(lambda x: np.sqrt(1 - x**2), 0, 1)[0]
ok1 = abs(val1 - quarter_circle) < 1e-8
print(f"{'PASS' if ok1 else 'FAIL'} — ∫₀¹ √(1-x²) dx = π/4 = {np.pi/4:.8f}, got {val1:.8f}")

# Same integral after substituting x = sin(θ)
val2 = integrate.quad(lambda theta: np.cos(theta)**2, 0, np.pi/2)[0]
ok2 = abs(val2 - quarter_circle) < 1e-8
print(f"{'PASS' if ok2 else 'FAIL'} — ∫₀^(π/2) cos²(θ) dθ = π/4 = {val2:.8f}")

# ML Application: normalizing constant of von Mises distribution
# p(θ;κ) ∝ exp(κ cos θ) — circular distribution for angles
# Used in positional encoding analysis
print('\n=== von Mises Normalization Constant ===')
from scipy.special import i0  # modified Bessel function
for kappa in (0.0, 0.5, 1.0, 2.0, 5.0):
    # Bind kappa as a default so the integrand is self-contained
    integrand = lambda theta, k=kappa: np.exp(k * np.cos(theta))
    Z_numerical = integrate.quad(integrand, 0, 2*np.pi)[0]
    Z_analytic = 2 * np.pi * i0(kappa)
    ok = abs(Z_numerical - Z_analytic) / Z_analytic < 1e-6
    print(f"  {'PASS' if ok else 'FAIL'} κ={kappa:.1f}: Z={Z_numerical:.6f} (2π·I₀(κ)={Z_analytic:.6f})")

16. Policy Gradients — REINFORCE in Detail

The REINFORCE gradient estimator:

$$\nabla_\theta \mathbb{E}_{\tau\sim p_\theta}[R(\tau)] = \mathbb{E}_{\tau\sim p_\theta}\left[R(\tau)\,\nabla_\theta \ln p_\theta(\tau)\right]$$

is derived by differentiating under the integral sign (Leibniz rule) and using the log-derivative trick $\nabla_\theta p_\theta = p_\theta \nabla_\theta \ln p_\theta$.

Code cell 41

# === 16.1 REINFORCE Gradient — Bandit Demo ===

import numpy as np

np.random.seed(42)

# 2-armed bandit: theta = [logit_arm1, logit_arm2]
# R(a=0)=1, R(a=1)=3 — we want to find theta maximizing E[R]

def softmax(logits):
    """Numerically stable softmax: shift by the max before exponentiating."""
    shifted = np.exp(logits - logits.max())
    return shifted / shifted.sum()

theta = np.array([0.0, 0.0])  # uniform policy to start
lr = 0.05
rewards = np.array([1.0, 3.0])

# Analytic: E[R] = pi_0 * 1 + pi_1 * 3 = 1 + 2*pi_1
# Maximized at pi_1 = 1 (all mass on arm 1)
# REINFORCE gradient: E[R * score]

loss_history = []
for step in range(300):
    pi = softmax(theta)
    action = np.random.choice(2, p=pi)  # sample a ~ pi(.|theta)
    reward = rewards[action]
    # Softmax score function: ∇_theta log pi(a|theta) = one_hot(a) - pi
    grad_log_pi = np.eye(2)[action] - pi
    # REINFORCE step — gradient ASCENT on E[R]
    theta = theta + lr * reward * grad_log_pi
    loss_history.append(np.dot(softmax(theta), rewards))

final_pi = softmax(theta)
final_E_R = np.dot(final_pi, rewards)
print(f'Final policy: pi = [{final_pi[0]:.4f}, {final_pi[1]:.4f}]')
print(f'Final E[R] = {final_E_R:.4f} (max possible: 3.0)')

ok = final_E_R > 2.8
print(f"{'PASS' if ok else 'FAIL'} — REINFORCE converged to near-optimal policy")

print('\nThe gradient ∇E[R] was computed purely from SAMPLES via the score function')
print('No analytic formula for p(tau) needed — just log p(a|theta)')

17. Importance Sampling — Change of Measure

To estimate $\mathbb{E}_p[f(x)]$ using samples from a proposal $q \ne p$:

$$\mathbb{E}_p[f(x)] = \int f(x)\,p(x)\,dx = \int f(x)\,\frac{p(x)}{q(x)}\,q(x)\,dx = \mathbb{E}_q\left[f(x)\,\frac{p(x)}{q(x)}\right]$$

The ratio $w(x) = p(x)/q(x)$ is the importance weight. Used in off-policy RL (PPO clipping), self-normalized IS, and annealed importance sampling (AIS).

Code cell 43

# === 17.1 Importance Sampling — Rare Event Estimation ===

import numpy as np
from scipy import stats
from scipy.stats import norm

np.random.seed(42)

# Goal: P(X > 4) for X ~ N(0,1) — a rare event, true value ≈ 3.167e-5
true_prob = norm.sf(4)  # survival function = 1 - Phi(4)
print(f'True P(X > 4) = {true_prob:.2e}')

n = 100000
threshold = 4

# --- Naive Monte Carlo: draw from p = N(0,1), count exceedances ---
draws_p = np.random.randn(n)
mc_naive = np.mean(draws_p > threshold)
print(f'Naive MC: {mc_naive:.2e} (expected ~{true_prob:.2e})')

# --- Importance sampling: proposal q = N(4.5, 1), centered on the rare region ---
mu_q = 4.5
draws_q = np.random.randn(n) + mu_q  # sample from q
# log weights: log p(x) - log q(x); the shared normalizers cancel
log_weight = (-draws_q**2/2) - (-(draws_q - mu_q)**2/2)
weight = np.exp(log_weight)
# IS estimate: E_q[1(x>4) * w(x)]
mc_is = np.mean((draws_q > threshold).astype(float) * weight)
print(f'IS MC:    {mc_is:.2e} (proposal q=N({mu_q},1))')

ok_naive = mc_naive == 0 or abs(mc_naive - true_prob)/true_prob < 0.5
ok_is = abs(mc_is - true_prob)/true_prob < 0.01
print(f"\n{'PASS' if ok_is else 'FAIL'} — IS estimate accurate to 1%")
print(f"Naive MC needs ~{int(1/true_prob):,} samples to see ONE event")
print('IS concentrates samples where the event occurs => huge variance reduction')

18. Error Analysis — Richardson Extrapolation

Richardson extrapolation eliminates the leading error term to get higher-order accuracy:

$$I = T_h - \frac{h^2}{12}\bigl(f'(b)-f'(a)\bigr) + O(h^4)$$

Two trapezoid estimates with steps $h$ and $h/2$ combine to cancel the $O(h^2)$ term:

$$I \approx \frac{4T_{h/2} - T_h}{3} + O(h^4) \quad \text{(= Simpson's rule!)}$$

Code cell 45

# === 18.1 Richardson Extrapolation ===

import numpy as np
from scipy import integrate, special

f = lambda x: np.exp(-x**2)
a, b = 0.0, 2.0
true_val = np.sqrt(np.pi)/2 * special.erf(2)

def trap(n):
    """Composite trapezoid estimate of ∫_a^b f with n equal subintervals."""
    xs = np.linspace(a, b, n+1)
    h = (b-a)/n
    return h * (f(xs[0])/2 + np.sum(f(xs[1:-1])) + f(xs[-1])/2)

print('Richardson Extrapolation:')
# Double-quoted f-string: same-quote nesting inside {...} needs Python 3.12+
print(f"{'n':>6} | {'T_n':>16} | {'Trap err':>10} | {'Richardson':>16} | {'Rich err':>10}")

for n in [4, 8, 16, 32, 64]:
    Th  = trap(n)
    Th2 = trap(2*n)
    # (4*T_{h/2} - T_h) / 3 cancels the O(h²) term -> Simpson's rule
    richardson = (4*Th2 - Th) / 3
    err_R = abs(richardson - true_val)
    # Trapezoid error shown alongside, making the O(h²) vs O(h⁴) contrast visible
    err_T = abs(Th - true_val)
    print(f'{n:6d} | {Th:16.12f} | {err_T:10.2e} | {richardson:16.12f} | {err_R:10.2e}')

print('\nRichardson extrapolation = Simpson\'s rule!')
print('Idea extends to Romberg integration (repeated extrapolation)')

19. Summary — Integration in ML

| Concept | ML Application | Section |
|---|---|---|
| Riemann sums | Mini-batch as sample average | §2 |
| FTC (Part 2) | Computing definite losses and probabilities | §3 |
| u-substitution | Change of variables in normalizing flows | §4 |
| Integration by parts | REINFORCE log-derivative trick | §5 |
| Improper integrals | Normalization of Gaussian ($\sqrt\pi$), KL | §7 |
| Monte Carlo | SGD = stochastic gradient = MC integration | §8 |
| Probability integrals | Expected loss, KL, cross-entropy, entropy | §9 |
| Neural ODEs | Continuous-depth networks, adjoint backprop | §10 |
| Attention as expectation | Softmax as discrete probability, kernel attention | §11 |
| VAE ELBO | Reconstruction + KL = two integrals | §12 |
| Score matching | Diffusion models train score $\nabla\ln p$ | §13 |
| Gaussian quadrature | Optimal numerical integration | §14 |
| Importance sampling | Off-policy RL, rare-event estimation | §17 |

Key takeaway: Integration is not just a calculus technique — it is the mathematical language of probability, expected values, and the training objectives of modern ML. Every loss function you minimize is an integral.

Code cell 47

# === 19. Final Verification Suite ===

import numpy as np
from scipy import integrate, stats, special

# Each entry: (description, bool) — all six must pass.
results = []

# 1. FTC: ∫₀^π sin(x) = 2
v, _ = integrate.quad(np.sin, 0, np.pi)
results.append(('FTC: ∫₀^π sin(x)dx = 2', np.allclose(v, 2.0)))

# 2. Gaussian integral: ∫_{-∞}^∞ e^(-x²) = √π (tails beyond ±50 are negligible)
v, _ = integrate.quad(lambda x: np.exp(-x**2), -50, 50)
results.append(('Gaussian: ∫e^(-x²)dx = √π', np.allclose(v, np.sqrt(np.pi), atol=1e-6)))

# 3. Normal PDF normalizes to 1
v, _ = integrate.quad(stats.norm.pdf, -50, 50)
results.append(('Normal PDF integrates to 1', np.allclose(v, 1.0, atol=1e-8)))

# 4. KL(N(1,1)||N(0,1)) = 0.5  (guard against log(0) in the far tails)
def kl_integrand(x):
    p = stats.norm(1,1).pdf(x); q = stats.norm(0,1).pdf(x)
    return p*np.log(p/q) if p>1e-300 and q>1e-300 else 0.0
v, _ = integrate.quad(kl_integrand, -20, 20)
results.append(('KL(N(1,1)||N(0,1)) = 0.5', np.allclose(v, 0.5, atol=1e-5)))

# 5. p-integral: ∫₁^∞ x⁻² dx = 1
v, _ = integrate.quad(lambda x: x**(-2), 1, np.inf)
results.append(('∫₁^∞ x⁻² dx = 1', np.allclose(v, 1.0, atol=1e-8)))

# 6. Entropy of N(0,1) = 0.5*ln(2πe)
dist = stats.norm(0,1)
def h_int(x): p=dist.pdf(x); return -p*np.log(p) if p>1e-300 else 0.0
v, _ = integrate.quad(h_int, -20, 20)
expected_H = 0.5*np.log(2*np.pi*np.e)
results.append(('Entropy N(0,1) = 0.5·ln(2πe)', np.allclose(v, expected_H, atol=1e-6)))

print('=== Final Verification Suite ===')
for name, ok in results:
    # Separator added — previously PASS/FAIL was fused to the name
    print(f"{'PASS' if ok else 'FAIL'} — {name}")
n_pass = sum(ok for _, ok in results)
print(f'\n{n_pass}/{len(results)} passed.')
print('\nIntegration: the engine beneath probability, statistics, and ML.')

20. Special Integrals — Gamma and Beta Functions

$$\Gamma(n) = \int_0^\infty x^{n-1}e^{-x}\,dx, \qquad \Gamma(n+1) = n! \text{ for integer } n$$

$$B(a,b) = \int_0^1 x^{a-1}(1-x)^{b-1}\,dx = \frac{\Gamma(a)\Gamma(b)}{\Gamma(a+b)}$$

These appear in Dirichlet distributions (Bayesian topic models, Dirichlet process priors in LLM fine-tuning), Beta distributions, and Bayesian inference broadly.

Code cell 49

# === 20.1 Gamma and Beta Functions ===

import math  # previously missing here — cell relied on an earlier cell's import
import numpy as np
from scipy import integrate, special

print('=== Gamma Function ===')
# Gamma(n) = (n-1)! for positive integers
for n in [1, 2, 3, 4, 5, 6]:
    G_numerical, _ = integrate.quad(lambda x: x**(n-1) * np.exp(-x), 0, np.inf)
    G_analytic = special.gamma(n)
    G_factorial = math.factorial(n-1)  # loop only reaches n=6, always exact
    ok = abs(G_numerical - G_analytic) / G_analytic < 1e-6
    print(f"  {'PASS' if ok else 'FAIL'} Γ({n}) = {G_numerical:.6f} = {int(G_factorial)}!")

# Γ(1/2) = √π (the integrand has an integrable singularity at 0)
G_half, _ = integrate.quad(lambda x: x**(-0.5) * np.exp(-x), 0, np.inf)
ok = abs(G_half - np.sqrt(np.pi)) < 1e-6
print(f"\n{'PASS' if ok else 'FAIL'} Γ(1/2) = √π = {np.sqrt(np.pi):.8f}, got {G_half:.8f}")

print('\n=== Beta Function ===')
# B(a,b) = Γ(a)Γ(b)/Γ(a+b)
test_cases = [(2.0, 3.0), (0.5, 0.5), (1.0, 1.0), (3.0, 2.0)]
for a, b in test_cases:
    B_num, _ = integrate.quad(lambda x: x**(a-1) * (1-x)**(b-1), 0, 1)
    B_formula = special.gamma(a)*special.gamma(b)/special.gamma(a+b)
    ok = abs(B_num - B_formula) / B_formula < 1e-6
    print(f"  {'PASS' if ok else 'FAIL'} B({a},{b}) = {B_num:.8f} = Γ(a)Γ(b)/Γ(a+b) = {B_formula:.8f}")

print('\nDirichlet distribution uses B(alpha) as normalizing constant')
print('Bayesian LM: Dirichlet prior on token probabilities')

21. Convolution as Integration

The convolution of two functions:

$$(f * g)(t) = \int_{-\infty}^\infty f(\tau)\,g(t-\tau)\,d\tau$$

Convolution is the foundation of CNN filters (discrete case), signal processing, and the convolution theorem connecting to Fourier transforms.

Code cell 51

# === 21.1 Convolution — Gaussian Self-Convolution ===

import numpy as np
from scipy import stats, signal

# N(0,1) * N(0,1) = N(0,2) (sum of independent normals)
# Proof via convolution integral

# Dense symmetric grid on which both densities are sampled
x = np.linspace(-10, 10, 2001)
dx = x[1] - x[0]
std_normal = stats.norm(0, 1)
f1 = std_normal.pdf(x)
f2 = std_normal.pdf(x)

# Riemann-sum approximation of the convolution integral: Σ f(τ)g(t-τ) Δτ
conv = dx * np.convolve(f1, f2, mode='same')

# Exact answer: variances add, so the result is N(0, √2)
true_conv = stats.norm(0, np.sqrt(2)).pdf(x)

print('Convolution N(0,1)*N(0,1) = N(0,√2):')
for x_val in (-2.0, -1.0, 0.0, 1.0, 2.0):
    idx = int(np.argmin(np.abs(x - x_val)))
    conv_val, true_val = conv[idx], true_conv[idx]
    ok = abs(conv_val - true_val) < 0.01
    print(f"  {'PASS' if ok else 'FAIL'} x={x_val:5.1f}: conv={conv_val:.6f}, true={true_val:.6f}")

# In CNNs: discrete convolution = applying learned filters
print('\nCNN filter = discrete approximation to continuous convolution integral')
print('Gaussian blur kernel = discretized N(0,σ²) convolution')

22. Numerical Stability in Integration

Floating-point errors accumulate in numerical integration. Key issues:

  • Cancellation error: subtracting nearly equal numbers loses precision
  • Log-sum-exp trick: compute $\log\sum_i e^{x_i}$ stably by subtracting the maximum first
  • Kahan summation: compensated summation reduces accumulated floating-point error from $O(n\epsilon)$ to $O(\epsilon)$

Code cell 53

# === 22.1 Log-Sum-Exp and Numerical Stability ===

import numpy as np

# Log-sum-exp: log(sum_i exp(x_i)) — naive overflow
def logsumexp_naive(x):
    """Direct log(Σ exp(x_i)) — overflows/underflows for large-magnitude x."""
    return np.log(np.exp(x).sum())

def logsumexp_stable(x):
    """Max-shifted log-sum-exp: log Σ exp(x_i) = m + log Σ exp(x_i - m)."""
    shift = np.max(x)
    return shift + np.log(np.exp(x - shift).sum())

x_small = np.array([1.0, 2.0, 3.0])
x_large = np.array([1000.0, 1001.0, 1002.0])  # naive overflows
x_neg   = np.array([-1000.0, -1001.0, -1002.0])  # naive underflows to 0

from scipy.special import logsumexp as scipy_lse

print('Log-Sum-Exp Stability:')
# Dict preserves insertion order, so cases print in the same sequence
cases = {'small': x_small, 'large': x_large, 'negative': x_neg}
for name, vec in cases.items():
    naive = logsumexp_naive(vec)
    stable = logsumexp_stable(vec)
    scipy_val = scipy_lse(vec)
    ok = np.isfinite(stable) and abs(stable - scipy_val) < 1e-8
    print(f"  {'PASS' if ok else 'FAIL'} {name}: naive={naive}, stable={stable:.6f}, scipy={scipy_val:.6f}")

# Kahan summation
def kahan_sum(xs):
    """Kahan compensated summation: carries lost low-order bits forward."""
    total = 0.0
    comp = 0.0  # running compensation for rounding error
    for value in xs:
        adjusted = value - comp
        new_total = total + adjusted
        # (new_total - total) is what was actually added; subtracting
        # `adjusted` recovers the rounding error to apply next iteration.
        comp = (new_total - total) - adjusted
        total = new_total
    return total

print('\nKahan summation vs naive:')
# Sum of 1/n for n=1..N — compare with the asymptotic harmonic number
N = 1000000
xs = 1.0 / np.arange(1, N + 1)  # vectorized; identical IEEE values to [1/n ...]
naive_sum = float(np.sum(xs))
kahan = kahan_sum(xs)
# H_N ≈ ln(N) + γ; use numpy's full-precision Euler–Mascheroni constant
# instead of a hand-typed truncation.
true_approx = np.log(N) + np.euler_gamma
print(f'  Naive:  {naive_sum:.10f}')
print(f'  Kahan:  {kahan:.10f}')
print(f'  ~True:  {true_approx:.10f} (Euler-Mascheroni estimate)')