Converted from theory.ipynb for web reading.
Integration — Theory Notebook
"To integrate is to make whole — to recover from the rate of change the quantity that changed."
Interactive exploration of integration: Riemann sums, the Fundamental Theorem, techniques, numerical methods, and connections to probability and machine learning.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
import seaborn as sns
sns.set_theme(style="whitegrid", palette="colorblind")
HAS_SNS = True
except ImportError:
plt.style.use("seaborn-v0_8-whitegrid")
HAS_SNS = False
mpl.rcParams.update({
"figure.figsize": (10, 6),
"figure.dpi": 120,
"font.size": 13,
"axes.titlesize": 15,
"axes.labelsize": 13,
"xtick.labelsize": 11,
"ytick.labelsize": 11,
"legend.fontsize": 11,
"legend.framealpha": 0.85,
"lines.linewidth": 2.0,
"axes.spines.top": False,
"axes.spines.right": False,
"savefig.bbox": "tight",
"savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import math
import numpy as np
import numpy.linalg as la
from scipy import integrate, special, stats
from math import factorial
import matplotlib.patches as patches
COLORS = {
"primary": "#0077BB",
"secondary": "#EE7733",
"tertiary": "#009988",
"error": "#CC3311",
"neutral": "#555555",
"highlight": "#EE3377",
}
HAS_MPL = True
np.set_printoptions(precision=8, suppress=True)
np.random.seed(42)
def header(title):
print("\n" + "=" * len(title))
print(title)
print("=" * len(title))
def check_true(name, cond):
ok = bool(cond)
print(f"{'PASS' if ok else 'FAIL'} - {name}")
return ok
def check_close(name, got, expected, tol=1e-8):
ok = np.allclose(got, expected, atol=tol, rtol=tol)
print(f"{'PASS' if ok else 'FAIL'} - {name}: got {got}, expected {expected}")
return ok
def centered_diff(f, x, h=1e-6):
return (f(x + h) - f(x - h)) / (2 * h)
def forward_diff(f, x, h=1e-6):
return (f(x + h) - f(x)) / h
def backward_diff(f, x, h=1e-6):
return (f(x) - f(x - h)) / h
def grad_check(f, x, analytic_grad, h=1e-6):
x = np.asarray(x, dtype=float)
analytic_grad = np.asarray(analytic_grad, dtype=float)
numeric_grad = np.zeros_like(x, dtype=float)
for idx in np.ndindex(x.shape):
x_plus = x.copy(); x_minus = x.copy()
x_plus[idx] += h; x_minus[idx] -= h
numeric_grad[idx] = (f(x_plus) - f(x_minus)) / (2 * h)
denom = la.norm(analytic_grad) + la.norm(numeric_grad) + 1e-12
return la.norm(analytic_grad - numeric_grad) / denom
def check(name, got, expected, tol=1e-8):
return check_close(name, got, expected, tol=tol)
print("Chapter helper setup complete.")
1. Intuition — Integration as Accumulation
The integral answers: how much total change accumulates when the rate is $f(x)$? It is simultaneously:
- Area under the curve $y = f(x)$ (geometry)
- Accumulated change given a rate (physics; see the sketch below)
- Expected value $\mathbb{E}[f(X)] = \int f(x)\,p(x)\,dx$ for $X \sim p$ (probability)
- Infinite sum of infinitesimal slices $\lim \sum f(x_i^*)\,\Delta x_i$ (limiting process)
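Before the area demos below, a minimal numeric sketch of the "accumulated change from a rate" reading (the velocity function here is illustrative, not part of the notebook's own examples):

```python
import numpy as np
from scipy import integrate

# Rate of change: velocity v(t) = 3t²  =>  total displacement on [0, T] is T³
v = lambda t: 3 * t**2
T = 2.0

total_change, _ = integrate.quad(v, 0, T)   # accumulate the rate over [0, T]
print(f'∫₀² 3t² dt = {total_change:.6f} (expected T³ = {T**3:.6f})')

# The running accumulation F(t) = ∫₀ᵗ v(s) ds recovers the position curve t³
ts = np.linspace(0, T, 201)
F = integrate.cumulative_trapezoid(v(ts), ts, initial=0.0)
print(f'max |F(t) - t³| on the grid: {np.max(np.abs(F - ts**3)):.2e}')
```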
Code cell 5
# === 1.1 Integration as Area — Visual Demo ===
import numpy as np
def f(x): return np.sin(x) + 1.5 # positive function for clarity
a, b = 0, np.pi
true_area, _ = integrate.quad(f, a, b)
print(f'True area under sin(x)+1.5 on [0,π]: {true_area:.6f}')
print(f'Expected: π*1.5 + 2 = {np.pi*1.5 + 2:.6f}') # ∫sin = 2, ∫1.5 = 1.5π
if HAS_MPL:
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
x_fine = np.linspace(a, b, 400)
n_list = [4, 8, 32]
for ax, n in zip(axes, n_list):
xs = np.linspace(a, b, n+1)
h = (b-a)/n
riemann = sum(f(xs[i]) * h for i in range(n))
ax.fill_between(x_fine, f(x_fine), alpha=0.3, color='steelblue', label='True area')
ax.plot(x_fine, f(x_fine), 'k-', lw=2)
for i in range(n):
rect = patches.Rectangle((xs[i], 0), h, f(xs[i]),
linewidth=0.5, edgecolor='navy', facecolor='orange', alpha=0.5)
ax.add_patch(rect)
ax.set_title(f'n={n}: R_n = {riemann:.4f}')
ax.set_xlabel('x'); ax.set_ylabel('f(x)')
plt.suptitle('Left Riemann Sums Converging to True Area', fontsize=14)
plt.tight_layout(); plt.show()
print(f'True value: {true_area:.6f}')
2. Riemann Sums — The Formal Definition
For a partition $a = x_0 < x_1 < \cdots < x_n = b$ and sample points $x_i^* \in [x_{i-1}, x_i]$:

$$\int_a^b f(x)\,dx \;=\; \lim_{\max \Delta x_i \to 0} \sum_{i=1}^{n} f(x_i^*)\,\Delta x_i, \qquad \Delta x_i = x_i - x_{i-1}.$$

Left sums: $x_i^* = x_{i-1}$; Right sums: $x_i^* = x_i$; Midpoint sums: $x_i^* = \tfrac{1}{2}(x_{i-1} + x_i)$.
Code cell 7
# === 2.1 Convergence Rates of Riemann Sums ===
import numpy as np
def f(x): return np.exp(-x**2) # Gaussian: no closed form integral
a, b = 0.0, 2.0
true_val = np.sqrt(np.pi)/2 * special.erf(2) # erf(2) ≈ 0.9953
ns = [4, 8, 16, 32, 64, 128, 256, 512]
errors_left, errors_mid, errors_right = [], [], []
for n in ns:
h = (b-a)/n
xs = np.linspace(a, b, n+1)
left = h * np.sum(f(xs[:-1]))
right = h * np.sum(f(xs[1:]))
mid = h * np.sum(f((xs[:-1] + xs[1:])/2))
errors_left.append(abs(left - true_val))
errors_right.append(abs(right - true_val))
errors_mid.append(abs(mid - true_val))
print(f'True ∫₀² e^(-x²) dx = {true_val:.8f}')
print(f"{'n':>6} | {'Left err':>12} | {'Mid err':>12} | {'Right err':>12}")
for i, n in enumerate(ns):
print(f'{n:6d} | {errors_left[i]:12.2e} | {errors_mid[i]:12.2e} | {errors_right[i]:12.2e}')
# Log-log slope: should be ~1 for left/right, ~2 for midpoint
log_ns = np.log(ns)
slope_left = np.polyfit(log_ns, np.log(errors_left), 1)[0]
slope_mid = np.polyfit(log_ns, np.log(errors_mid), 1)[0]
print(f'\nConvergence order — Left: {slope_left:.2f}, Midpoint: {slope_mid:.2f}')
print('(Expected: Left/Right ~ O(h^1) = -1, Midpoint ~ O(h^2) = -2)')
3. Fundamental Theorem of Calculus
Part 1 (Differentiation of accumulation functions): $\dfrac{d}{dx}\int_a^x f(t)\,dt = f(x)$.

Part 2 (Evaluation via antiderivatives): $\displaystyle\int_a^b f(x)\,dx = F(b) - F(a)$, where $F' = f$.

The FTC bridges differentiation and integration — they are inverse operations.
Code cell 9
# === 3.1 FTC Part 1 — Accumulation Function ===
import numpy as np
# F(x) = ∫₀ˣ cos(t) dt = sin(x)
# FTC Part 1: F'(x) = cos(x)
xs = np.linspace(0, 2*np.pi, 200)
# Numerical accumulation function
F_numerical = np.array([integrate.quad(np.cos, 0, x)[0] for x in xs])
F_analytic = np.sin(xs)
check('F(x) = ∫₀ˣ cos(t)dt = sin(x)', F_numerical, F_analytic, tol=1e-8)
# FTC Part 1: derivative of F_numerical ≈ cos(x)
h = xs[1] - xs[0]
dF_numerical = np.gradient(F_numerical, h)
cos_xs = np.cos(xs)
interior = slice(5, -5) # avoid boundary effects
check('d/dx[∫₀ˣ cos] ≈ cos(x)', dF_numerical[interior], cos_xs[interior], tol=1e-4)
print('\n=== FTC Part 2: ∫₀^π sin(x) dx = 2 ===')
result, _ = integrate.quad(np.sin, 0, np.pi)
check('∫₀^π sin(x) dx = 2', result, 2.0)
print('\nKey antiderivatives verified:')
tests = [
('∫x²dx from 0 to 1 = 1/3', lambda x: x**2, 0, 1, 1/3),
('∫eˣdx from 0 to 1 = e-1', np.exp, 0, 1, np.e-1),
('∫1/x dx from 1 to e = 1', lambda x: 1/x, 1, np.e, 1.0),
]
for name, fn, a, b, expected in tests:
val, _ = integrate.quad(fn, a, b)
check(name, val, expected)
Code cell 10
# === 3.2 Leibniz Rule — Differentiating Under the Integral Sign ===
import numpy as np
# F(a) = ∫₀^a e^(-ax²) dx
# dF/da via Leibniz: d/da ∫₀^a e^(-at²)dt
# = e^{-a·a²} * 1 + ∫₀^a ∂/∂a [e^{-at²}] dt
# = e^{-a³} + ∫₀^a (-t²)e^{-at²} dt
def F_leibniz(a_val):
return integrate.quad(lambda t: np.exp(-a_val * t**2), 0, a_val)[0]
def dF_leibniz_analytic(a_val):
# Leibniz: upper limit term + integral of derivative
upper_term = np.exp(-a_val**3)
integral_term = integrate.quad(
lambda t: -t**2 * np.exp(-a_val * t**2), 0, a_val)[0]
return upper_term + integral_term
def dF_numerical(a_val, h=1e-6):
return (F_leibniz(a_val+h) - F_leibniz(a_val-h)) / (2*h)
print('Leibniz Rule Verification:')
for a_val in [0.5, 1.0, 1.5, 2.0]:
analytic = dF_leibniz_analytic(a_val)
numerical = dF_numerical(a_val)
ok = abs(analytic - numerical) < 1e-5
print(f" {'PASS' if ok else 'FAIL'} a={a_val}: analytic={analytic:.6f}, numerical={numerical:.6f}")
print('\nLeibniz rule: d/da ∫_{g(a)}^{h(a)} f(x,a)dx = f(h(a),a)h\'(a) - f(g(a),a)g\'(a) + ∫∂f/∂a dx')
4. u-Substitution
The reverse chain rule: if the integrand has the form $f(g(x))\,g'(x)$, set $u = g(x)$, $du = g'(x)\,dx$:

$$\int f(g(x))\,g'(x)\,dx = \int f(u)\,du.$$

For definite integrals, transform the limits: $x = a \mapsto u = g(a)$, $x = b \mapsto u = g(b)$.
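A minimal sketch of the limit transformation (the integrand is just an illustration): with $u = x^2$, the limits $x \in [0, 2]$ become $u \in [0, 4]$, and both forms agree with $\sin(4)$.

```python
import numpy as np
from scipy import integrate

# ∫₀² 2x·cos(x²) dx  with  u = x², du = 2x dx  becomes  ∫₀⁴ cos(u) du = sin(4)
x_form, _ = integrate.quad(lambda x: 2*x*np.cos(x**2), 0, 2)  # original variable
u_form, _ = integrate.quad(np.cos, 0, 4)                      # transformed limits: u(0)=0, u(2)=4
print(f'x-form: {x_form:.8f}')
print(f'u-form: {u_form:.8f}')
print(f'sin(4): {np.sin(4):.8f}')
assert abs(x_form - u_form) < 1e-8 and abs(x_form - np.sin(4)) < 1e-8
```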
Code cell 12
# === 4.1 u-Substitution Examples ===
import numpy as np
print('=== u-Substitution Verification ===')
# (a) ∫ 2x·cos(x²) dx = sin(x²) + C
# u = x², du = 2x dx => ∫cos(u)du = sin(u) = sin(x²)
def f_a(x): return 2*x * np.cos(x**2)
antideriv_a = lambda x: np.sin(x**2)
# Verify: derivative of antiderivative = integrand
x_test = 1.3
d_antideriv = (antideriv_a(x_test+1e-7) - antideriv_a(x_test-1e-7)) / 2e-7
check('d/dx[sin(x²)] = 2x·cos(x²)', d_antideriv, f_a(x_test))
# (b) ∫₀¹ x·e^(x²) dx — definite integral with limits
# u = x², du = 2x dx => (1/2)∫₀¹ e^u du = (1/2)(e-1)
val_b, _ = integrate.quad(lambda x: x*np.exp(x**2), 0, 1)
expected_b = (np.e - 1)/2
check('∫₀¹ x·e^(x²) dx = (e-1)/2', val_b, expected_b)
# (c) ∫ sin³(x) dx — repeated substitution
# = ∫ sin²(x)·sin(x) dx = ∫(1-cos²x)·sin(x) dx
# u = cos(x), du = -sin(x) dx => -∫(1-u²)du = -u + u³/3 = -cos(x)+cos³(x)/3
def antideriv_c(x): return -np.cos(x) + np.cos(x)**3/3
def f_c(x): return np.sin(x)**3
d_c = (antideriv_c(x_test+1e-7) - antideriv_c(x_test-1e-7)) / 2e-7
check('d/dx[-cos(x)+cos³(x)/3] = sin³(x)', d_c, f_c(x_test))
# (d) Normalizing flows change-of-variables
# If z = g(x), then p_X(x) = p_Z(g(x))|g'(x)|
# (1D version — full Jacobian in multivariable)
print('\n--- Normalizing Flow change-of-variables ---')
def g(x): return x**3 # invertible for all x
def g_inv(z): return np.cbrt(z)
def g_prime(x): return 3*x**2
# Standard normal Z => transform to X = g⁻¹(Z)
from scipy.stats import norm
x_val = 1.5
z_val = g(x_val)
p_Z = norm.pdf(z_val)
p_X = p_Z * abs(g_prime(x_val)) # change of variables
print(f'x={x_val}, z=g(x)={z_val:.4f}')
print(f'p_Z(z) = {p_Z:.6f}, |g\'(x)| = {g_prime(x_val):.4f}')
print(f'p_X(x) = p_Z(g(x))|g\'(x)| = {p_X:.6f}')
5. Integration by Parts
The reverse product rule:

$$\int u\,dv = uv - \int v\,du.$$

LIATE order for choosing $u$: Logarithms, Inverse trig, Algebraic, Trig, Exponential (choose $u$ from the leftmost category present).
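A quick numeric check of the LIATE choice (this integrand is an extra illustration, separate from the examples in the cell below): for $\int x \ln(x)\,dx$, LIATE picks $u = \ln(x)$ over the algebraic factor, giving $\tfrac{x^2}{2}\ln(x) - \tfrac{x^2}{4}$.

```python
import numpy as np
from scipy import integrate

# LIATE: for ∫ x·ln(x) dx choose u = ln(x), dv = x dx
#   => uv - ∫ v du = x²·ln(x)/2 - ∫ x/2 dx = x²·ln(x)/2 - x²/4
def antideriv(x): return x**2*np.log(x)/2 - x**2/4
def integrand(x): return x*np.log(x)

# Derivative of the antiderivative should reproduce the integrand
x_test = 1.7
d_num = (antideriv(x_test+1e-7) - antideriv(x_test-1e-7)) / 2e-7
print(f'd/dx[x²ln(x)/2 - x²/4] at {x_test}: {d_num:.8f} vs x·ln(x) = {integrand(x_test):.8f}')

# Definite check on [1, e]: F(e) - F(1) = (e² + 1)/4
val, _ = integrate.quad(integrand, 1, np.e)
print(f'∫₁ᵉ x·ln(x) dx = {val:.8f} (expected (e²+1)/4 = {(np.e**2 + 1)/4:.8f})')
```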
ML connection: The REINFORCE gradient estimator uses the log-derivative trick (differentiation under the integral sign, see §16) to compute gradients of expected values.
Code cell 14
# === 5.1 Integration by Parts Examples ===
import numpy as np
print('=== Integration by Parts Verification ===')
# (a) ∫ x·eˣ dx = xeˣ - eˣ + C = eˣ(x-1) + C
def antideriv_a(x): return np.exp(x) * (x - 1)
def f_a(x): return x * np.exp(x)
x_test = 1.5
d_a = (antideriv_a(x_test+1e-7) - antideriv_a(x_test-1e-7)) / 2e-7
check('d/dx[eˣ(x-1)] = x·eˣ', d_a, f_a(x_test))
# (b) ∫ ln(x) dx = x·ln(x) - x + C
def antideriv_b(x): return x*np.log(x) - x
def f_b(x): return np.log(x)
x_test = 2.0
d_b = (antideriv_b(x_test+1e-7) - antideriv_b(x_test-1e-7)) / 2e-7
check('d/dx[x·ln(x)-x] = ln(x)', d_b, f_b(x_test))
# (c) ∫ eˣ·sin(x) dx — cyclic trick
# Let I = ∫eˣ·sin(x) dx
# IBP twice: I = eˣ·sin(x) - eˣ·cos(x) - I => 2I = eˣ(sin(x)-cos(x))
def antideriv_c(x): return np.exp(x)*(np.sin(x)-np.cos(x))/2
def f_c(x): return np.exp(x)*np.sin(x)
d_c = (antideriv_c(x_test+1e-7) - antideriv_c(x_test-1e-7)) / 2e-7
check('d/dx[eˣ(sin-cos)/2] = eˣ·sin(x)', d_c, f_c(x_test))
# (d) REINFORCE: ∇_θ E_pθ[f(x)] = E_pθ[f(x)·∇_θ log pθ(x)]
print('\n--- REINFORCE gradient estimator demo ---')
# Bernoulli: pθ(1)=θ, pθ(0)=1-θ; f(x) = x (reward)
# Analytic gradient: d/dθ E[X] = d/dθ θ = 1
# REINFORCE estimate: E[f(x) * ∇log pθ(x)]
theta = 0.4
n_samples = 100000
np.random.seed(42)
x_samples = (np.random.rand(n_samples) < theta).astype(float)
log_grad = np.where(x_samples == 1, 1/theta, -1/(1-theta))
reinforce_grad = np.mean(x_samples * log_grad)
print(f'REINFORCE gradient estimate: {reinforce_grad:.4f} (true: 1.0)')
check('REINFORCE ≈ 1.0', reinforce_grad, 1.0, tol=0.02)
6. Partial Fractions
Rational functions can be split into simpler fractions:
- Distinct linear factors: $\dfrac{1}{(x-a)(x-b)} = \dfrac{A}{x-a} + \dfrac{B}{x-b}$
- Repeated linear factors: $\dfrac{1}{(x-a)^2(x-b)} = \dfrac{A}{x-a} + \dfrac{B}{(x-a)^2} + \dfrac{C}{x-b}$ (checked numerically below)
- Irreducible quadratics: $\dfrac{1}{(x^2+bx+c)(x-a)} = \dfrac{Ax+B}{x^2+bx+c} + \dfrac{C}{x-a}$
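The repeated-factor case is not covered by the verification cell below, so here is a minimal sketch with an illustrative integrand, using the decomposition $\tfrac{1}{x(x+1)^2} = \tfrac{1}{x} - \tfrac{1}{x+1} - \tfrac{1}{(x+1)^2}$.

```python
import numpy as np
from scipy import integrate

# Repeated linear factor: 1/(x(x+1)²) = 1/x - 1/(x+1) - 1/(x+1)²
# Antiderivative: ln|x| - ln|x+1| + 1/(x+1)
def f(x): return 1/(x*(x+1)**2)
def antideriv(x): return np.log(abs(x)) - np.log(abs(x+1)) + 1/(x+1)

# Derivative of the antiderivative should match the integrand
x_test = 1.5
d_num = (antideriv(x_test+1e-7) - antideriv(x_test-1e-7)) / 2e-7
print(f'd/dx check at x={x_test}: {d_num:.8f} vs {f(x_test):.8f}')

# Definite integral check
a, b = 1.0, 3.0
analytic = antideriv(b) - antideriv(a)
numerical, _ = integrate.quad(f, a, b)
print(f'∫₁³ dx/(x(x+1)²): analytic={analytic:.10f}, quad={numerical:.10f}')
```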
Code cell 16
# === 6.1 Partial Fractions — Numerical Verification ===
import numpy as np
from scipy import integrate
# ∫ 1/((x-1)(x+2)) dx
# PFD: 1/((x-1)(x+2)) = A/(x-1) + B/(x+2)
# A = 1/(1+2) = 1/3, B = 1/(-2-1) = -1/3
# => (1/3)∫1/(x-1)dx - (1/3)∫1/(x+2)dx = (1/3)ln|x-1| - (1/3)ln|x+2| + C
def f_original(x): return 1/((x-1)*(x+2))
def antideriv(x): return (np.log(abs(x-1)) - np.log(abs(x+2)))/3
# Verify antiderivative
x_test = 2.5
d_num = (antideriv(x_test+1e-7) - antideriv(x_test-1e-7)) / 2e-7
ok = abs(d_num - f_original(x_test)) < 1e-5
print(f"{'PASS' if ok else 'FAIL'} — antiderivative of 1/((x-1)(x+2)) verified at x=2.5")
# Definite integral
a, b = 2.0, 4.0
analytical = antideriv(b) - antideriv(a)
numerical, _ = integrate.quad(f_original, a, b)
ok2 = abs(analytical - numerical) < 1e-8
print(f"{'PASS' if ok2 else 'FAIL'} — ∫₂⁴ 1/((x-1)(x+2)) dx: analytic={analytical:.8f}, numerical={numerical:.8f}")
# Case 2: Irreducible quadratic
# ∫ 1/(x²+1) dx = arctan(x) + C
val, _ = integrate.quad(lambda x: 1/(x**2+1), 0, 1)
ok3 = abs(val - np.pi/4) < 1e-8
print(f"{'PASS' if ok3 else 'FAIL'} — ∫₀¹ 1/(x²+1) dx = π/4 = {np.pi/4:.8f}, got {val:.8f}")
7. Improper Integrals
Type I (infinite limits): $\displaystyle\int_a^{\infty} f(x)\,dx = \lim_{b \to \infty} \int_a^b f(x)\,dx$

Type II (unbounded integrand): take the limit as the endpoint approaches the singularity, e.g. $\displaystyle\int_0^1 x^{-1/2}\,dx = \lim_{\epsilon \to 0^+} \int_\epsilon^1 x^{-1/2}\,dx = 2$

p-integral test: $\displaystyle\int_1^{\infty} x^{-p}\,dx$ converges $\iff p > 1$ (with value $\tfrac{1}{p-1}$).

Gaussian integral: $\displaystyle\int_{-\infty}^{\infty} e^{-x^2}\,dx = \sqrt{\pi}$ — central to ML statistics.
Code cell 18
# === 7.1 Improper Integrals - Convergence ===
import numpy as np
from scipy import integrate, special
from scipy.stats import norm
print('=== p-Integral Convergence Test ===')
ps = [0.5, 0.9, 1.0, 1.1, 2.0, 3.0]
for p in ps:
expected_conv = p > 1
if expected_conv:
val, _ = integrate.quad(lambda x: x**(-p), 1, np.inf, limit=200)
analytic = 1.0 / (p - 1.0)
ok = abs(val - analytic) < 1e-6
print(f" {'PASS' if ok else 'FAIL'} p={p}: value={val:.4f}, analytic={analytic:.4f}")
else:
print(f" PASS p={p}: diverges by the p-integral test")
print('\n=== Gaussian Integral ===')
gaussian, _ = integrate.quad(lambda x: np.exp(-x**2), -50, 50, limit=200)
print(f'integral exp(-x^2) dx ~= {gaussian:.8f}')
print(f'sqrt(pi) = {np.sqrt(np.pi):.8f}')
check('Gaussian integral = sqrt(pi)', gaussian, np.sqrt(np.pi), tol=1e-6)
print('\n=== Standard Normal Normalization ===')
norm_integral, _ = integrate.quad(norm.pdf, -50, 50, limit=200)
check('Integral of N(0,1) pdf = 1', norm_integral, 1.0, tol=1e-8)
print('Takeaway: improper integrals need analytic convergence checks before numerical quadrature.')
8. Numerical Integration
When analytic antiderivatives don't exist, we approximate:
| Method | Formula | Error |
|---|---|---|
| Trapezoid | $\frac{h}{2}\big[f(x_0) + 2f(x_1) + \cdots + 2f(x_{n-1}) + f(x_n)\big]$ | $O(h^2)$ |
| Simpson's | $\frac{h}{3}\big[f(x_0) + 4f(x_1) + 2f(x_2) + \cdots + f(x_n)\big]$ | $O(h^4)$ |
| Monte Carlo | $(b-a)\,\frac{1}{n}\sum_{i=1}^{n} f(X_i)$, $X_i \sim \mathrm{Unif}(a,b)$ | $O(1/\sqrt{n})$ |

Monte Carlo is dimension-independent — unlike grid methods, which need $n^d$ points in $d$ dimensions.
Code cell 20
# === 8.1 Numerical Integration — Accuracy Comparison ===
import numpy as np
from scipy import integrate, special
# Integrate f(x) = e^(-x²) on [0, 2]
# True value: (√π/2) * erf(2)
true_val = np.sqrt(np.pi)/2 * special.erf(2)
a, b = 0.0, 2.0
def f(x): return np.exp(-x**2)
def trapezoid(f, a, b, n):
xs = np.linspace(a, b, n+1)
h = (b-a)/n
return h * (f(xs[0])/2 + np.sum(f(xs[1:-1])) + f(xs[-1])/2)
def simpsons(f, a, b, n):
if n % 2 != 0: n += 1
xs = np.linspace(a, b, n+1)
h = (b-a)/n
coeffs = np.ones(n+1)
coeffs[1:-1:2] = 4
coeffs[2:-2:2] = 2
return (h/3) * np.dot(coeffs, f(xs))
def monte_carlo(f, a, b, n, seed=42):
np.random.seed(seed)
xs = np.random.uniform(a, b, n)
return (b-a) * np.mean(f(xs))
print(f'True value: {true_val:.10f}\n')
print(f"{'n':>8} | {'Trap error':>12} | {'Simp error':>12} | {'MC error':>12}")
for n in [10, 50, 100, 500, 1000]:
t_err = abs(trapezoid(f, a, b, n) - true_val)
s_err = abs(simpsons(f, a, b, n) - true_val)
m_err = abs(monte_carlo(f, a, b, n) - true_val)
print(f'{n:8d} | {t_err:12.2e} | {s_err:12.2e} | {m_err:12.2e}')
print('\nSimpson\'s converges much faster (O(h⁴) vs O(h²) for trapezoid)')
print('Monte Carlo: O(1/√n) regardless of dimension — see § on high-dim below')
Code cell 21
# === 8.2 Monte Carlo — Dimension Independence ===
import numpy as np
# Integrate f(x) = exp(-||x||²) over [-3,3]^d
# True value for d dimensions: (√π·erf(3))^d
from scipy.special import erf
def mc_integral_d(d, n=100000, seed=42):
np.random.seed(seed)
xs = np.random.uniform(-3, 3, (n, d))
fvals = np.exp(-np.sum(xs**2, axis=1))
vol = 6.0**d
return vol * np.mean(fvals)
def true_val_d(d):
return (np.sqrt(np.pi) * erf(3))**d
print(f"{'d':>3} | {'True value':>14} | {'MC estimate':>14} | {'Rel error':>10}")
for d in [1, 2, 5, 10, 20]:
true = true_val_d(d)
mc = mc_integral_d(d)
rel_err = abs(mc - true) / abs(true)
print(f'{d:3d} | {true:14.4e} | {mc:14.4e} | {rel_err:10.4f}')
print('\nKey insight: MC error stays ~O(1/√n) regardless of dimension.')
print('Grid methods (trapezoid): need n^d points => exponential cost.')
print('This is why mini-batch SGD works: it is Monte Carlo for E[∇L].')
9. Integration in Probability
Continuous probability lives in the integration framework:

$$P(a \le X \le b) = \int_a^b p(x)\,dx, \qquad F(x) = \int_{-\infty}^{x} p(t)\,dt, \qquad \mathbb{E}[f(X)] = \int f(x)\,p(x)\,dx.$$
Code cell 23
# === 9.1 PDF/CDF and Expected Values ===
import numpy as np
from scipy import integrate, stats
print('=== Gaussian PDF/CDF Verification ===')
# Standard normal
mu, sigma = 2.0, 1.5
dist = stats.norm(mu, sigma)
# Total probability = 1
total, _ = integrate.quad(dist.pdf, -100, 100)
ok1 = abs(total - 1.0) < 1e-8
print(f"{'PASS' if ok1 else 'FAIL'} — PDF integrates to 1: {total:.10f}")
# E[X] = μ
E_X, _ = integrate.quad(lambda x: x * dist.pdf(x), -100, 100)
ok2 = abs(E_X - mu) < 1e-6
print(f"{'PASS' if ok2 else 'FAIL'} — E[X] = μ = {mu}: got {E_X:.8f}")
# E[X²] = σ² + μ²
E_X2, _ = integrate.quad(lambda x: x**2 * dist.pdf(x), -100, 100)
expected_E_X2 = sigma**2 + mu**2
ok3 = abs(E_X2 - expected_E_X2) < 1e-6
print(f"{'PASS' if ok3 else 'FAIL'} — E[X²] = σ²+μ² = {expected_E_X2}: got {E_X2:.8f}")
# CDF from PDF via FTC
x0 = 3.0
cdf_from_integral, _ = integrate.quad(dist.pdf, -np.inf, x0)
cdf_direct = dist.cdf(x0)
ok4 = abs(cdf_from_integral - cdf_direct) < 1e-8
print(f"{'PASS' if ok4 else 'FAIL'} — CDF(3) = ∫₋∞³ p(x)dx: integral={cdf_from_integral:.8f}, scipy={cdf_direct:.8f}")
Code cell 24
# === 9.2 KL Divergence — Gibbs Inequality Verification ===
import numpy as np
from scipy import integrate, stats
def kl_div_gaussian(mu1, sigma1, mu2, sigma2):
"""KL(N(mu1,sigma1²) || N(mu2,sigma2²)) — analytic formula."""
return (np.log(sigma2/sigma1)
+ (sigma1**2 + (mu1-mu2)**2) / (2*sigma2**2)
- 0.5)
def kl_div_numerical(mu1, sigma1, mu2, sigma2):
"""KL via numerical integration."""
p = stats.norm(mu1, sigma1)
q = stats.norm(mu2, sigma2)
def integrand(x):
px = p.pdf(x)
qx = q.pdf(x)
return px * np.log(px/qx) if px > 1e-300 else 0.0
val, _ = integrate.quad(integrand, mu1 - 10*sigma1, mu1 + 10*sigma1)
return val
print('KL Divergence: Analytic vs Numerical')
test_cases = [
(0.0, 1.0, 0.0, 1.0), # KL(N(0,1)||N(0,1)) = 0
(1.0, 1.0, 0.0, 1.0), # KL(N(1,1)||N(0,1)) = 0.5
(0.0, 1.0, 0.0, 2.0), # different variance
(2.0, 0.5, 0.0, 1.0), # both shifted
]
for mu1, s1, mu2, s2 in test_cases:
analytic = kl_div_gaussian(mu1, s1, mu2, s2)
numerical = kl_div_numerical(mu1, s1, mu2, s2)
ok = abs(analytic - numerical) < 1e-5 and analytic >= -1e-10
print(f" {'PASS' if ok else 'FAIL'} KL(N({mu1},{s1}²)||N({mu2},{s2}²)) = {analytic:.6f} (numerical={numerical:.6f})")
print('\nAll KL values ≥ 0 — Gibbs inequality verified.')
# VAE KL term: KL(N(mu,1) || N(0,1)) = mu²/2
mu_vals = [0.0, 1.0, 2.0, 3.0]
print('\nVAE KL penalty — KL(N(μ,1)||N(0,1)) = μ²/2:')
for mu in mu_vals:
analytic = mu**2 / 2
numerical = kl_div_gaussian(mu, 1.0, 0.0, 1.0)
ok = abs(analytic - numerical) < 1e-6
print(f" {'PASS' if ok else 'FAIL'} μ={mu}: μ²/2={analytic:.4f}, formula={numerical:.6f}")
Code cell 25
# === 9.3 Entropy and Cross-Entropy ===
import numpy as np
from scipy import integrate, stats
print('=== Differential Entropy of Gaussian ===')
# H[N(mu,sigma²)] = (1/2) ln(2πe·sigma²)
sigma = 2.0
dist = stats.norm(0, sigma)
def entropy_integrand(x):
px = dist.pdf(x)
return -px * np.log(px) if px > 1e-300 else 0.0
H_numerical, _ = integrate.quad(entropy_integrand, -20, 20)
H_analytic = 0.5 * np.log(2 * np.pi * np.e * sigma**2)
ok = abs(H_numerical - H_analytic) < 1e-6
print(f"{'PASS' if ok else 'FAIL'} — H[N(0,σ²)]: numerical={H_numerical:.8f}, analytic={H_analytic:.8f}")
print('\n=== Cross-Entropy and NLL ===')
# H(p, q) = -E_p[log q] = H(p) + KL(p||q)
# For Gaussians:
mu1, s1 = 0.0, 1.0 # true p
mu2, s2 = 1.0, 1.5 # model q
p = stats.norm(mu1, s1)
q = stats.norm(mu2, s2)
def ce_integrand(x):
px = p.pdf(x)
qx = q.pdf(x)
return -px * np.log(qx) if px > 1e-300 and qx > 1e-300 else 0.0
H_ce, _ = integrate.quad(ce_integrand, -15, 15)
H_p = 0.5 * np.log(2 * np.pi * np.e * s1**2)
from scipy.integrate import quad as q_int
def kl_int(x):
px, qx = p.pdf(x), q.pdf(x)
return px * np.log(px/qx) if px > 1e-300 and qx > 1e-300 else 0.0
KL, _ = q_int(kl_int, -15, 15)
H_ce_formula = H_p + KL
ok2 = abs(H_ce - H_ce_formula) < 1e-5
print(f"{'PASS' if ok2 else 'FAIL'} — Cross-entropy = H(p) + KL: CE={H_ce:.6f}, H_p+KL={H_ce_formula:.6f}")
print(f' H(p)={H_p:.6f}, KL(p||q)={KL:.6f}')
print('Minimizing CE(p,q) over q = minimizing KL(p||q) = MLE!')
10. Integration in Deep Learning — Neural ODEs
A Neural ODE (Chen et al. 2018) defines a continuous-depth model:

$$\frac{dh(t)}{dt} = f_\theta(h(t), t), \qquad h(T) = h(0) + \int_0^T f_\theta(h(t), t)\,dt,$$

where $f_\theta$ is a neural network. The forward pass is ODE integration; gradients are computed via the adjoint method (integration backwards in time) using $O(1)$ memory instead of $O(T)$ for storing intermediate states.
Code cell 27
# === 10.1 Neural ODE — Simple 1D Example ===
import numpy as np
from scipy.integrate import solve_ivp
# Simple 1D neural ODE: dh/dt = -a*h (exponential decay)
# Solution: h(T) = h(0) * exp(-a*T)
# This mimics a ResNet with infinitely many tiny residual blocks
a = 1.5 # learned parameter
h0 = 2.0
T = 1.0
def ode_func(t, h): return [-a * h[0]]
# Solve ODE = forward pass of Neural ODE
sol = solve_ivp(ode_func, [0, T], [h0], dense_output=True, rtol=1e-8)
h_T_ode = sol.y[0, -1]
h_T_true = h0 * np.exp(-a * T)
ok = abs(h_T_ode - h_T_true) < 1e-6
print(f"{'PASS' if ok else 'FAIL'} — ODE solution: h(T)={h_T_ode:.8f}, true={h_T_true:.8f}")
# Simulate ResNet discretization (Euler method with step size dt)
print('\nResNet (Euler) vs Neural ODE:')
for n_steps in [1, 2, 4, 8, 16, 32]:
dt = T / n_steps
h = h0
for _ in range(n_steps):
h += dt * (-a * h) # Euler step = residual connection
err = abs(h - h_T_true)
print(f' n_steps={n_steps:3d}: h(T)={h:.8f}, error={err:.2e}')
print(f' ODE solver: h(T)={h_T_ode:.8f}, error={abs(h_T_ode-h_T_true):.2e}')
print('\nNeural ODE = ResNet limit as n_steps → ∞, depth → ∞')
Code cell 28
# === 10.2 Adjoint Method — Backprop Through ODE ===
import numpy as np
from scipy.integrate import solve_ivp
# Verify adjoint gradient matches finite differences
# ODE: dh/dt = -a*h, h(0) = h0
# Loss: L = h(T)²/2
# Adjoint state λ(t) = dL/dh(t) satisfies dλ/dt = -λ·∂f/∂h = a·λ (solved backwards in time)
# Gradient: dL/da = ∫₀ᵀ λ(t)·∂f/∂a dt = ∫₀ᵀ λ(t)·(-h(t)) dt (adjoint × parameter sensitivity)
a_param = 1.5
h0_val = 2.0
T = 1.0
def h_solution(a, h0=h0_val, t_end=T):
return h0 * np.exp(-a * t_end)
def loss(a):
return h_solution(a)**2 / 2
# Analytic gradient: dL/da
# L = (h0*e^(-aT))²/2 => dL/da = -T * h0² * e^(-2aT)
def grad_analytic(a):
return -T * h0_val**2 * np.exp(-2*a*T)
# Numerical gradient
def grad_numerical(a, h=1e-5):
return (loss(a+h) - loss(a-h)) / (2*h)
print('Adjoint Gradient Verification:')
for a_test in [0.5, 1.0, 1.5, 2.0]:
ga = grad_analytic(a_test)
gn = grad_numerical(a_test)
ok = abs(ga - gn) < 1e-5
print(f" {'PASS' if ok else 'FAIL'} a={a_test}: analytic={ga:.8f}, numerical={gn:.8f}")
print('\nThe adjoint method computes this via backward ODE integration')
print('= O(1) memory (vs O(T) for storing intermediate states)')
11. Attention Mechanism as Continuous Integration
Scaled dot-product attention can be viewed as a discrete approximation to:

$$\mathrm{output}(q) = \mathbb{E}_{k \sim p(\cdot \mid q)}[v(k)] = \int v(k)\,p(k \mid q)\,dk,$$

where $p(k \mid q)$ is a softmax distribution over keys and the integral is approximated by summing over the key-value pairs. This motivates linear attention (kernel approximations; see the sketch after the cell below) and continuous attention (over sequences as functions).
Code cell 30
# === 11.1 Attention as Discrete Integration (Expected Value) ===
import numpy as np
np.random.seed(42)
# Attention: output = sum_j softmax(q·K/√d)_j * V_j
# = E_{j~softmax}[V_j] = discrete expected value of V under softmax distribution
d = 8 # embedding dim
n = 10 # sequence length
q = np.random.randn(d) # query
K = np.random.randn(n, d) # keys
V = np.random.randn(n, d) # values
# Standard attention
scores = K @ q / np.sqrt(d) # (n,)
scores -= scores.max() # numerical stability
weights = np.exp(scores)
weights /= weights.sum() # softmax = probability distribution
# Attention output = weighted sum of V = discrete E[V]
attn_output = weights @ V # (d,)
# Verify: this is exactly E_{j~softmax}[V_j]
E_V = sum(weights[j] * V[j] for j in range(n))
ok = np.allclose(attn_output, E_V)
print(f"{'PASS' if ok else 'FAIL'} — Attention = E_softmax[V]")
print(f'Attention weights (softmax probs): {weights}')
print(f'sum(weights) = {weights.sum():.8f} — verified probability distribution')
# Entropy of attention distribution
H_attn = -np.sum(weights * np.log(weights + 1e-300))
print(f'\nAttention entropy: {H_attn:.4f} nats')
print(f'Max possible (uniform): {np.log(n):.4f} nats')
print(f'Min possible (one-hot): 0')
print('\nHigh-entropy attention = attending broadly (integration)')
print('Low-entropy attention = attending sharply (evaluation at a point)')
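The linear attention mentioned in the intro replaces the softmax kernel $\exp(q \cdot k/\sqrt{d})$ with a factorizable feature-map kernel $\phi(q) \cdot \phi(k)$. A minimal sketch follows, using $\phi(x) = \mathrm{elu}(x) + 1$, a feature map common in the linear-attention literature (an assumption here, not part of this notebook); the output is still a weight-normalized average of the values, but the key sum can be precomputed once for all queries.

```python
import numpy as np
np.random.seed(0)

d, n = 8, 10
q = np.random.randn(d)
K = np.random.randn(n, d)
V = np.random.randn(n, d)

def phi(x):
    # elu(x) + 1: a positive feature map assumed here for the kernel approximation
    return np.where(x > 0, x + 1.0, np.exp(x))

# Softmax attention (reference)
s = K @ q / np.sqrt(d)
w_soft = np.exp(s - s.max()); w_soft /= w_soft.sum()
out_soft = w_soft @ V

# Linear attention: weights ∝ φ(q)·φ(k_j); the value sum factors as φ(q)·(Σⱼ φ(k_j) v_jᵀ)
phi_q, phi_K = phi(q), phi(K)
S = phi_K.T @ V               # (d, d) summary, independent of the query
z = phi_K.sum(axis=0)         # (d,) normalizer summary
out_lin = (phi_q @ S) / (phi_q @ z)

w_lin = (phi_K @ phi_q) / (phi_q @ z)
print(f'linear-attention weights sum to {w_lin.sum():.8f} (still a probability distribution)')
print(f'softmax output (first 3): {out_soft[:3]}')
print(f'linear  output (first 3): {out_lin[:3]} (an approximation, not identical)')
```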
12. VAE ELBO — Integration in Generative Models
The Variational Autoencoder objective (ELBO) is:

$$\mathcal{L}(\theta, \phi; x) = \mathbb{E}_{q_\phi(z \mid x)}\big[\log p_\theta(x \mid z)\big] - \mathrm{KL}\big(q_\phi(z \mid x)\,\|\,p(z)\big).$$

Both terms are integrals over $z$. The reconstruction term is estimated via Monte Carlo; the KL term is computed analytically for Gaussian $q_\phi(z \mid x)$ and prior $p(z) = \mathcal{N}(0, I)$.
Code cell 32
# === 12.1 VAE ELBO — Decomposition ===
import numpy as np
from scipy import stats
np.random.seed(42)
# Toy VAE: encoder q(z|x) = N(mu_phi(x), sigma_phi²(x))
# Prior: p(z) = N(0, I)
# Decoder: p(x|z) = N(mu_theta(z), I)
# Simulated encoder outputs for a mini-batch
batch_size = 64
z_dim = 4
x_dim = 8
# Encoder outputs
mu_phi = 0.5 * np.random.randn(batch_size, z_dim)
log_sigma = -0.5 * np.ones((batch_size, z_dim)) # sigma=exp(-0.5)≈0.6
sigma_phi = np.exp(log_sigma)
# Reparameterization trick: z = mu + sigma * epsilon, epsilon ~ N(0,I)
epsilon = np.random.randn(batch_size, z_dim)
z_samples = mu_phi + sigma_phi * epsilon
# Decoder: mu_theta(z) = W*z + b (linear for demo)
W = np.random.randn(z_dim, x_dim) / np.sqrt(z_dim)
x_true = np.random.randn(batch_size, x_dim)
mu_theta = z_samples @ W
# Reconstruction term: E_q[log p(x|z)] = -||x - mu_theta||² / 2 + const
recon_loss = -0.5 * np.mean(np.sum((x_true - mu_theta)**2, axis=1))
# KL term: KL(N(mu_phi, sigma²) || N(0,I)) = sum_j [mu_j²/2 + sigma_j²/2 - log(sigma_j) - 1/2]
kl_per_dim = 0.5 * (mu_phi**2 + sigma_phi**2 - 2*log_sigma - 1)
kl_loss = np.mean(np.sum(kl_per_dim, axis=1)) # mean over batch
elbo = recon_loss - kl_loss
print('=== VAE ELBO Components ===')
print(f'Reconstruction term: E[log p(x|z)] = {recon_loss:.4f}')
print(f'KL term: KL(q||p) = {kl_loss:.4f}')
print(f'ELBO = recon - KL = {elbo:.4f}')
# KL must be ≥ 0 (Gibbs inequality)
print(f'\nKL ≥ 0: {kl_loss >= 0}')
print('KL close to 0 ↔ encoder close to prior (posterior collapse risk)')
print('KL large → encoder far from prior (effective latent usage)')
print('\nELBO = integral-based bound on log-likelihood log p(x)')
13. Score Matching and Diffusion Models
Diffusion models (DDPM, DDIM, score-based SDEs) train a neural network $s_\theta(x)$ to estimate the score function $\nabla_x \log p(x)$. The training objective is:

$$\mathbb{E}_{x \sim p}\big[\lVert s_\theta(x) - \nabla_x \log p(x) \rVert^2\big].$$

This expectation is an integral over the data distribution. The score is the gradient of the log-density — differentiation playing a key role inside an integration-based objective.
Code cell 34
# === 13.1 Score Function of a Gaussian ===
import numpy as np
from scipy import stats, integrate
# For p(x) = N(mu, sigma²):
# score = d/dx log p(x) = -(x - mu) / sigma²
mu, sigma = 2.0, 1.5
dist = stats.norm(mu, sigma)
def score_analytic(x): return -(x - mu) / sigma**2
# Numerical: d/dx log p(x) via finite differences
def score_numerical(x, h=1e-5):
return (np.log(dist.pdf(x+h) + 1e-300) - np.log(dist.pdf(x-h) + 1e-300)) / (2*h)
print('Score function s(x) = d/dx log p(x) for N(2, 1.5²):')
xs = np.array([-1.0, 0.0, 1.0, 2.0, 3.0, 4.0])
for x in xs:
sa = score_analytic(x)
sn = score_numerical(x)
ok = abs(sa - sn) < 1e-4
print(f" {'PASS' if ok else 'FAIL'} x={x:5.1f}: analytic={sa:8.4f}, numerical={sn:8.4f}")
# Score = 0 at the mode
print(f'\nScore at mean x=mu: {score_analytic(mu):.8f} (should be 0)')
print('Score points TOWARD the mean — this is what diffusion uses to denoise')
# Denoising: x_noisy = x_clean + sigma_noise * epsilon
# Optimal denoiser: E[x_clean | x_noisy] = x_noisy + sigma_noise² * score(x_noisy)
print('\nDiffusion: score function guides noisy sample back to clean data distribution')
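The denoising identity mentioned in the comments above (often called Tweedie's formula) can be checked in a Gaussian toy case. This sketch assumes clean data $x \sim \mathcal{N}(\mu_0, s_0^2)$, so the noisy marginal and the posterior mean are both available in closed form; the specific numbers are illustrative.

```python
import numpy as np
np.random.seed(42)

# Toy setup: clean x ~ N(mu0, s0²), noisy y = x + sigma_n·eps, eps ~ N(0,1)
mu0, s0 = 1.0, 0.8
sigma_n = 0.5
marg_var = s0**2 + sigma_n**2            # marginal of y is N(mu0, s0² + sigma_n²)

def score_marginal(y):                    # ∇_y log p(y) for the Gaussian marginal
    return -(y - mu0) / marg_var

def posterior_mean(y):                    # E[x_clean | y], exact for this Gaussian case
    return (s0**2 * y + sigma_n**2 * mu0) / marg_var

for y in [-0.5, 0.5, 1.5, 2.5]:
    tweedie = y + sigma_n**2 * score_marginal(y)   # y + σ²·score(y)
    exact = posterior_mean(y)
    ok = abs(tweedie - exact) < 1e-12
    print(f" {'PASS' if ok else 'FAIL'} y={y:4.1f}: y + σ²·score = {tweedie:.6f}, E[x|y] = {exact:.6f}")

# Monte Carlo sanity check near one noisy value
x_clean = mu0 + s0 * np.random.randn(200000)
y_noisy = x_clean + sigma_n * np.random.randn(200000)
target = 1.5
mask = np.abs(y_noisy - target) < 0.02
print(f'MC E[x|y≈{target}]: {x_clean[mask].mean():.4f} vs exact {posterior_mean(target):.4f}')
```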
Code cell 35
# === 13.2 Monte Carlo Estimation of Expected Loss ===
import numpy as np
from scipy import stats
# Training loss = E_p[L(x, theta)]
# = ∫ L(x, theta) p(x) dx [integral over data distribution]
# Approximated by Monte Carlo: (1/n) sum_i L(x_i, theta)
np.random.seed(42)
# Simple regression: p(x) = N(0,1), L(x, theta) = (x - theta)²
# E[L] = E[(X-theta)²] = Var[X] + (E[X]-theta)² = 1 + theta²
# Minimized at theta=0 with E[L]=1
theta = 0.5
true_loss = 1 + theta**2 # analytic
n_samples_list = [10, 100, 1000, 10000, 100000]
print(f'True expected loss: {true_loss:.6f}')
print(f"{'n':>8} | {'MC estimate':>12} | {'Error':>10} | {'Std err':>10}")
for n in n_samples_list:
x_samples = np.random.randn(n)
losses = (x_samples - theta)**2
mc_est = np.mean(losses)
std_err = np.std(losses) / np.sqrt(n)
err = abs(mc_est - true_loss)
print(f'{n:8d} | {mc_est:12.6f} | {err:10.6f} | {std_err:10.6f}')
print('\nMC error ~ O(1/√n) — matches theoretical CLT bound')
print('This IS mini-batch SGD: each mini-batch is a MC estimate of E[∇L]')
14. Gaussian Quadrature — Optimal Node Selection
Gauss-Legendre quadrature selects nodes $x_i$ and weights $w_i$ to exactly integrate any polynomial of degree $\le 2n - 1$:

$$\int_{-1}^{1} f(x)\,dx \approx \sum_{i=1}^{n} w_i\,f(x_i).$$

Nodes are roots of the Legendre polynomial $P_n(x)$. The error term involves $f^{(2n)}$, so for smooth integrands it shrinks factorially in the number of nodes $n$ — exponentially better than trapezoid.
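The degree-$(2n-1)$ exactness claim can be checked directly; a minimal sketch on $[-1, 1]$ with $n = 3$ nodes (exact through degree 5, but not degree 6).

```python
import numpy as np
from numpy.polynomial.legendre import leggauss

nodes, weights = leggauss(3)   # 3-point rule: exact for polynomials of degree ≤ 5
def gl(f): return float(np.dot(weights, f(nodes)))

# ∫_{-1}^{1} x^k dx = 0 for odd k, 2/(k+1) for even k
for k in [2, 3, 4, 5, 6]:
    exact = 0.0 if k % 2 else 2.0/(k+1)
    approx = gl(lambda x: x**k)
    status = 'exact' if abs(approx - exact) < 1e-14 else f'error {abs(approx - exact):.2e}'
    print(f'  x^{k}: Gauss-Legendre = {approx:+.12f}, true = {exact:+.12f} ({status})')
```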
Code cell 37
# === 14.1 Gaussian Quadrature vs Simpson's ===
import numpy as np
from numpy.polynomial.legendre import leggauss
from scipy import integrate, special
def gauss_legendre(f, a, b, n):
"""n-point Gauss-Legendre quadrature on [a,b]."""
nodes, weights = leggauss(n)
# Transform from [-1,1] to [a,b]
xs = 0.5*(b-a)*nodes + 0.5*(b+a)
ws = 0.5*(b-a)*weights
return np.dot(ws, f(xs))
def simpsons(f, a, b, n=100):
if n % 2 != 0: n += 1
xs = np.linspace(a, b, n+1)
h = (b-a)/n
coeffs = np.ones(n+1); coeffs[1:-1:2]=4; coeffs[2:-2:2]=2
return (h/3)*np.dot(coeffs, f(xs))
a, b = 0.0, 2.0
f = lambda x: np.exp(-x**2)
true_val = np.sqrt(np.pi)/2 * special.erf(2)
print(f'True ∫₀² e^(-x²) dx = {true_val:.12f}')
print(f"{'Method':>20} | {'n':>4} | {'Estimate':>16} | {'Error':>10}")
for n in [2, 3, 5, 8, 10]:
gl = gauss_legendre(f, a, b, n)
err = abs(gl - true_val)
print(f"{'Gauss-Legendre':>20} | {n:4d} | {gl:16.12f} | {err:10.2e}")
for n in [10, 50, 100, 500]:
simp = simpsons(f, a, b, n)
err = abs(simp - true_val)
print(f"{'Simpsons':>20} | {n:4d} | {simp:16.12f} | {err:10.2e}")
print('\nA handful of Gauss-Legendre nodes rivals hundreds of Simpson\'s points')
print('Reason: Gauss nodes are OPTIMAL (not equally spaced)')
15. Trigonometric Substitution
For integrands containing square roots, trig substitutions eliminate them:

| Expression | Substitution | Identity used |
|---|---|---|
| $\sqrt{a^2 - x^2}$ | $x = a\sin\theta$ | $1 - \sin^2\theta = \cos^2\theta$ |
| $\sqrt{a^2 + x^2}$ | $x = a\tan\theta$ | $1 + \tan^2\theta = \sec^2\theta$ |
| $\sqrt{x^2 - a^2}$ | $x = a\sec\theta$ | $\sec^2\theta - 1 = \tan^2\theta$ |
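The tangent row can be verified the same way the sine case is handled in the cell below; a minimal sketch with an illustrative integral: for $\int_0^1 dx/\sqrt{1+x^2}$, setting $x = \tan\theta$ gives $\int_0^{\pi/4} \sec\theta\,d\theta$, and both equal $\ln(1 + \sqrt{2})$.

```python
import numpy as np
from scipy import integrate

# x = tan(θ), dx = sec²(θ)dθ, √(1+x²) = sec(θ)  =>  integrand becomes sec(θ)
val_x, _ = integrate.quad(lambda x: 1/np.sqrt(1 + x**2), 0, 1)
val_theta, _ = integrate.quad(lambda t: 1/np.cos(t), 0, np.pi/4)
exact = np.log(1 + np.sqrt(2))   # = arcsinh(1)
print(f'x-form:   {val_x:.10f}')
print(f'θ-form:   {val_theta:.10f}')
print(f'ln(1+√2): {exact:.10f}')
assert abs(val_x - exact) < 1e-8 and abs(val_theta - exact) < 1e-8
```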
Code cell 39
# === 15.1 Trig Substitution — Area of Semicircle ===
import numpy as np
from scipy import integrate
# ∫₀¹ √(1-x²) dx = π/4 (area of quarter circle)
# x = sin(θ), dx = cos(θ)dθ, √(1-x²) = cos(θ)
# ∫₀^(π/2) cos²(θ) dθ = π/4
# Original form
val1, _ = integrate.quad(lambda x: np.sqrt(1 - x**2), 0, 1)
ok1 = abs(val1 - np.pi/4) < 1e-8
print(f"{'PASS' if ok1 else 'FAIL'} — ∫₀¹ √(1-x²) dx = π/4 = {np.pi/4:.8f}, got {val1:.8f}")
# After trig substitution: same integral in θ-form
val2, _ = integrate.quad(lambda theta: np.cos(theta)**2, 0, np.pi/2)
ok2 = abs(val2 - np.pi/4) < 1e-8
print(f"{'PASS' if ok2 else 'FAIL'} — ∫₀^(π/2) cos²(θ) dθ = π/4 = {val2:.8f}")
# ML Application: normalizing constant of von Mises distribution
# p(θ;κ) ∝ exp(κ cos θ) — circular distribution for angles
# Used in positional encoding analysis
print('\n=== von Mises Normalization Constant ===')
from scipy.special import i0 # modified Bessel function
for kappa in [0.0, 0.5, 1.0, 2.0, 5.0]:
Z_numerical, _ = integrate.quad(
lambda theta: np.exp(kappa * np.cos(theta)), 0, 2*np.pi)
Z_analytic = 2 * np.pi * i0(kappa)
ok = abs(Z_numerical - Z_analytic) / Z_analytic < 1e-6
print(f" {'PASS' if ok else 'FAIL'} κ={kappa:.1f}: Z={Z_numerical:.6f} (2π·I₀(κ)={Z_analytic:.6f})")
16. Policy Gradients — REINFORCE in Detail
The REINFORCE gradient estimator:

$$\nabla_\theta\, \mathbb{E}_{x \sim p_\theta}[f(x)] = \mathbb{E}_{x \sim p_\theta}\big[f(x)\,\nabla_\theta \log p_\theta(x)\big]$$

is derived by differentiating under the integral sign (Leibniz rule) and using the log-derivative trick $\nabla_\theta \log p_\theta(x) = \nabla_\theta p_\theta(x) / p_\theta(x)$.
Code cell 41
# === 16.1 REINFORCE Gradient — Bandit Demo ===
import numpy as np
np.random.seed(42)
# 2-armed bandit: theta = [logit_arm1, logit_arm2]
# R(a=0)=1, R(a=1)=3 — we want to find theta maximizing E[R]
def softmax(logits):
e = np.exp(logits - logits.max())
return e / e.sum()
theta = np.array([0.0, 0.0]) # start uniform
lr = 0.05
rewards = np.array([1.0, 3.0])
# Analytic: E[R] = pi_0 * 1 + pi_1 * 3 = 1 + 2*pi_1
# Maximized at pi_1 = 1 (all mass on arm 1)
# REINFORCE gradient: E[R * score]
loss_history = []
for step in range(300):
pi = softmax(theta)
# Sample action
a = np.random.choice(2, p=pi)
r = rewards[a]
# Score function: d/dtheta log pi(a|theta)
# For softmax: score = e_a - pi (indicator - prob vector)
score = np.zeros(2)
score[a] = 1.0
score -= pi
# REINFORCE update (gradient ascent)
theta += lr * r * score
loss_history.append(np.dot(softmax(theta), rewards))
final_pi = softmax(theta)
final_E_R = np.dot(final_pi, rewards)
print(f'Final policy: pi = [{final_pi[0]:.4f}, {final_pi[1]:.4f}]')
print(f'Final E[R] = {final_E_R:.4f} (max possible: 3.0)')
ok = final_E_R > 2.8
print(f"{'PASS' if ok else 'FAIL'} — REINFORCE converged to near-optimal policy")
print('\nThe gradient ∇E[R] was computed purely from SAMPLES via the score function')
print('No analytic formula for p(tau) needed — just log p(a|theta)')
17. Importance Sampling — Change of Measure
To estimate $\mathbb{E}_{x \sim p}[f(x)]$ using samples from a proposal $q$:

$$\mathbb{E}_{p}[f(x)] = \int f(x)\,\frac{p(x)}{q(x)}\,q(x)\,dx = \mathbb{E}_{q}\!\left[f(x)\,\frac{p(x)}{q(x)}\right].$$

The ratio $w(x) = p(x)/q(x)$ is the importance weight. Used in off-policy RL (PPO clipping), self-normalized IS, and AIS.
Code cell 43
# === 17.1 Importance Sampling — Rare Event Estimation ===
import numpy as np
from scipy import stats
np.random.seed(42)
# Estimate P(X > 4) for X ~ N(0,1)
# True value: 1 - Phi(4) ≈ 3.167e-5 (rare event)
from scipy.stats import norm
true_prob = norm.sf(4) # survival function
print(f'True P(X > 4) = {true_prob:.2e}')
n = 100000
# Naive MC: sample X ~ N(0,1), compute fraction > 4
x_naive = np.random.randn(n)
mc_naive = np.mean(x_naive > 4)
print(f'Naive MC: {mc_naive:.2e} (expected ~{true_prob:.2e})')
# Importance sampling: proposal q = N(4.5, 1) (shifted toward rare region)
mu_q = 4.5
x_is = np.random.randn(n) + mu_q # sample from q
# Importance weights: p(x)/q(x) = exp(-x²/2) / exp(-(x-mu_q)²/2)
log_w = (-x_is**2/2) - (-(x_is-mu_q)**2/2)
w = np.exp(log_w)
# IS estimate: E_q[1(x>4) * w(x)]
mc_is = np.mean((x_is > 4).astype(float) * w)
print(f'IS MC: {mc_is:.2e} (proposal q=N({mu_q},1))')
ok_naive = mc_naive == 0 or abs(mc_naive - true_prob)/true_prob < 0.5
ok_is = abs(mc_is - true_prob)/true_prob < 0.01
print(f"\n{'PASS' if ok_is else 'FAIL'} — IS estimate accurate to 1%")
print(f"Naive MC needs ~{int(1/true_prob):,} samples to see ONE event")
print('IS concentrates samples where the event occurs => huge variance reduction')
18. Error Analysis — Richardson Extrapolation
Richardson extrapolation eliminates the leading error term to get higher-order accuracy. Two trapezoid estimates with steps $h$ and $h/2$ combine to cancel the $O(h^2)$ term:

$$R = \frac{4\,T_{h/2} - T_h}{3} = I + O(h^4).$$
Code cell 45
# === 18.1 Richardson Extrapolation ===
import numpy as np
from scipy import integrate, special
f = lambda x: np.exp(-x**2)
a, b = 0.0, 2.0
true_val = np.sqrt(np.pi)/2 * special.erf(2)
def trap(n):
xs = np.linspace(a, b, n+1)
h = (b-a)/n
return h * (f(xs[0])/2 + np.sum(f(xs[1:-1])) + f(xs[-1])/2)
print('Richardson Extrapolation:')
print(f"{'n':>6} | {'T_n':>16} | {'Richardson':>16} | {'Trap err':>12} | {'Rich err':>12}")
for n in [4, 8, 16, 32, 64]:
Th = trap(n)
Th2 = trap(2*n)
# Richardson: (4*T_{h/2} - T_h) / 3 = Simpson's
Richardson = (4*Th2 - Th) / 3
err_R = abs(Richardson - true_val)
err_T = abs(Th - true_val)
    print(f'{n:6d} | {Th:16.12f} | {Richardson:16.12f} | {err_T:12.2e} | {err_R:12.2e}')
print('\nRichardson extrapolation = Simpson\'s rule!')
print('Idea extends to Romberg integration (repeated extrapolation)')
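The Romberg idea mentioned in the print above is just this extrapolation applied repeatedly; a minimal sketch building the table by hand from trapezoid estimates (same integrand as above).

```python
import numpy as np
from scipy import special

f = lambda x: np.exp(-x**2)
a, b = 0.0, 2.0
true_val = np.sqrt(np.pi)/2 * special.erf(2)

def trap(n):
    xs = np.linspace(a, b, n+1)
    h = (b-a)/n
    return h * (f(xs[0])/2 + np.sum(f(xs[1:-1])) + f(xs[-1])/2)

levels = 5
R = np.zeros((levels, levels))
for k in range(levels):
    R[k, 0] = trap(2**k)                            # column 0: plain trapezoid, h halved each row
    for j in range(1, k+1):                          # column j: cancel the O(h^(2j)) error term
        R[k, j] = (4**j * R[k, j-1] - R[k-1, j-1]) / (4**j - 1)

print('Romberg table errors |R[k,j] - true|:')
for k in range(levels):
    row = '  '.join(f'{abs(R[k, j] - true_val):.2e}' for j in range(k+1))
    print(f'  n={2**k:2d}: {row}')
```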
19. Summary — Integration in ML
| Concept | ML Application | Section |
|---|---|---|
| Riemann sums | Mini-batch as sample average | §2 |
| FTC (Part 2) | Computing definite losses and probabilities | §3 |
| u-substitution | Change of variables in normalizing flows | §4 |
| Integration by parts | REINFORCE log-derivative trick | §5 |
| Improper integrals | Normalization of the Gaussian ($\int e^{-x^2}dx = \sqrt{\pi}$), KL | §7 |
| Monte Carlo | SGD = stochastic gradient = MC integration | §8 |
| Probability integrals | Expected loss, KL, cross-entropy, entropy | §9 |
| Neural ODEs | Continuous-depth networks, adjoint backprop | §10 |
| Attention as expectation | Softmax as discrete probability, kernel attention | §11 |
| VAE ELBO | Reconstruction + KL = two integrals | §12 |
| Score matching | Diffusion models train the score $\nabla_x \log p(x)$ | §13 |
| Gaussian quadrature | Optimal numerical integration | §14 |
| Importance sampling | Off-policy RL, rare-event estimation | §17 |
Key takeaway: Integration is not just a calculus technique — it is the mathematical language of probability, expected values, and the training objectives of modern ML. Every loss function you minimize is an integral.
Code cell 47
# === 19. Final Verification Suite ===
import numpy as np
from scipy import integrate, stats, special
results = []
# 1. FTC: ∫₀^π sin(x) = 2
v, _ = integrate.quad(np.sin, 0, np.pi)
results.append(('FTC: ∫₀^π sin(x)dx = 2', np.allclose(v, 2.0)))
# 2. Gaussian integral: ∫_{-∞}^∞ e^(-x²) = √π
v, _ = integrate.quad(lambda x: np.exp(-x**2), -50, 50)
results.append(('Gaussian: ∫e^(-x²)dx = √π', np.allclose(v, np.sqrt(np.pi), atol=1e-6)))
# 3. Normal PDF normalizes to 1
v, _ = integrate.quad(stats.norm.pdf, -50, 50)
results.append(('Normal PDF integrates to 1', np.allclose(v, 1.0, atol=1e-8)))
# 4. KL(N(1,1)||N(0,1)) = 0.5
def kl_integrand(x):
p = stats.norm(1,1).pdf(x); q = stats.norm(0,1).pdf(x)
return p*np.log(p/q) if p>1e-300 and q>1e-300 else 0.0
v, _ = integrate.quad(kl_integrand, -20, 20)
results.append(('KL(N(1,1)||N(0,1)) = 0.5', np.allclose(v, 0.5, atol=1e-5)))
# 5. p-integral: ∫₁^∞ x⁻² dx = 1
v, _ = integrate.quad(lambda x: x**(-2), 1, np.inf)
results.append(('∫₁^∞ x⁻² dx = 1', np.allclose(v, 1.0, atol=1e-8)))
# 6. Entropy of N(0,1) = 0.5*ln(2πe)
dist = stats.norm(0,1)
def h_int(x): p=dist.pdf(x); return -p*np.log(p) if p>1e-300 else 0.0
v, _ = integrate.quad(h_int, -20, 20)
expected_H = 0.5*np.log(2*np.pi*np.e)
results.append(('Entropy N(0,1) = 0.5·ln(2πe)', np.allclose(v, expected_H, atol=1e-6)))
print('=== Final Verification Suite ===')
for name, ok in results:
print(f"{'PASS' if ok else 'FAIL'} — {name}")
n_pass = sum(ok for _, ok in results)
print(f'\n{n_pass}/{len(results)} passed.')
print('\nIntegration: the engine beneath probability, statistics, and ML.')
20. Special Integrals — Gamma and Beta Functions
$$\Gamma(z) = \int_0^{\infty} x^{z-1} e^{-x}\,dx, \qquad B(a, b) = \int_0^1 x^{a-1}(1-x)^{b-1}\,dx = \frac{\Gamma(a)\,\Gamma(b)}{\Gamma(a+b)}.$$

These appear in Dirichlet distributions (Bayesian topic models, Dirichlet process priors in LLM fine-tuning), Beta distributions, and Bayesian inference broadly.
Code cell 49
# === 20.1 Gamma and Beta Functions ===
import numpy as np
from scipy import integrate, special
print('=== Gamma Function ===')
# Gamma(n) = (n-1)! for positive integers
for n in [1, 2, 3, 4, 5, 6]:
G_numerical, _ = integrate.quad(lambda x: x**(n-1) * np.exp(-x), 0, np.inf)
G_analytic = special.gamma(n)
G_factorial = float(math.factorial(n-1)) if n <= 10 else float('inf')
ok = abs(G_numerical - G_analytic) / G_analytic < 1e-6
print(f" {'PASS' if ok else 'FAIL'} Γ({n}) = {G_numerical:.6f} = {int(G_factorial)}!")
# Γ(1/2) = √π
G_half, _ = integrate.quad(lambda x: x**(-0.5) * np.exp(-x), 0, np.inf)
ok = abs(G_half - np.sqrt(np.pi)) < 1e-6
print(f"\n{'PASS' if ok else 'FAIL'} Γ(1/2) = √π = {np.sqrt(np.pi):.8f}, got {G_half:.8f}")
print('\n=== Beta Function ===')
# B(a,b) = Γ(a)Γ(b)/Γ(a+b)
test_cases = [(2.0, 3.0), (0.5, 0.5), (1.0, 1.0), (3.0, 2.0)]
for a, b in test_cases:
B_num, _ = integrate.quad(lambda x: x**(a-1) * (1-x)**(b-1), 0, 1)
B_formula = special.gamma(a)*special.gamma(b)/special.gamma(a+b)
ok = abs(B_num - B_formula) / B_formula < 1e-6
print(f" {'PASS' if ok else 'FAIL'} B({a},{b}) = {B_num:.8f} = Γ(a)Γ(b)/Γ(a+b) = {B_formula:.8f}")
print('\nDirichlet distribution uses B(alpha) as normalizing constant')
print('Bayesian LM: Dirichlet prior on token probabilities')
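The Dirichlet claim can be checked directly in the 3-category case, where the normalizing constant is the multivariate Beta function $B(\alpha) = \Gamma(\alpha_1)\Gamma(\alpha_2)\Gamma(\alpha_3)/\Gamma(\alpha_1 + \alpha_2 + \alpha_3)$. A minimal sketch integrating the unnormalized density over the simplex (the α values are illustrative):

```python
import numpy as np
from scipy import integrate, special

# Dirichlet(α) on the 2-simplex: density ∝ x1^(α1-1) · x2^(α2-1) · (1-x1-x2)^(α3-1)
alpha = np.array([2.0, 3.0, 4.0])
a1, a2, a3 = alpha

def unnorm(x2, x1):                    # dblquad passes the inner variable first
    x3 = 1.0 - x1 - x2
    return x1**(a1-1) * x2**(a2-1) * x3**(a3-1) if x3 > 0 else 0.0

Z_numerical, _ = integrate.dblquad(unnorm, 0, 1, lambda x1: 0, lambda x1: 1 - x1)
Z_analytic = np.prod(special.gamma(alpha)) / special.gamma(alpha.sum())
ok = abs(Z_numerical - Z_analytic) / Z_analytic < 1e-6
print(f"{'PASS' if ok else 'FAIL'} B(α) for α={alpha}: dblquad={Z_numerical:.10f}, Γ-formula={Z_analytic:.10f}")
```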
21. Convolution as Integration
The convolution of two functions:

$$(f * g)(t) = \int_{-\infty}^{\infty} f(\tau)\,g(t - \tau)\,d\tau.$$

Convolution is the foundation of CNN filters (discrete case), signal processing, and the convolution theorem $\mathcal{F}\{f * g\} = \mathcal{F}\{f\}\cdot\mathcal{F}\{g\}$ connecting to Fourier transforms (checked after the cell below).
Code cell 51
# === 21.1 Convolution — Gaussian Self-Convolution ===
import numpy as np
from scipy import stats, signal
# N(0,1) * N(0,1) = N(0,2) (sum of independent normals)
# Proof via convolution integral
# Numerical convolution via scipy
x = np.linspace(-10, 10, 2001)
dx = x[1] - x[0]
f1 = stats.norm(0, 1).pdf(x)
f2 = stats.norm(0, 1).pdf(x)
# Discrete convolution approximating the integral
conv = np.convolve(f1, f2, mode='same') * dx
# True result: N(0, √2)
true_conv = stats.norm(0, np.sqrt(2)).pdf(x)
# Compare at a few points
print('Convolution N(0,1)*N(0,1) = N(0,√2):')
for x_val in [-2.0, -1.0, 0.0, 1.0, 2.0]:
idx = np.argmin(abs(x - x_val))
conv_val = conv[idx]
true_val = true_conv[idx]
ok = abs(conv_val - true_val) < 0.01
print(f" {'PASS' if ok else 'FAIL'} x={x_val:5.1f}: conv={conv_val:.6f}, true={true_val:.6f}")
# In CNNs: discrete convolution = applying learned filters
print('\nCNN filter = discrete approximation to continuous convolution integral')
print('Gaussian blur kernel = discretized N(0,σ²) convolution')
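A minimal check of the convolution theorem on the same kind of discrete grid: multiplying zero-padded spectra reproduces the direct (time-domain) convolution up to floating-point error.

```python
import numpy as np
from scipy import stats

# Discrete samples of two Gaussian pdfs (same setup as the cell above)
x = np.linspace(-10, 10, 2001)
dx = x[1] - x[0]
f1 = stats.norm(0, 1).pdf(x)
f2 = stats.norm(0, 1).pdf(x)

# Direct (time-domain) linear convolution
direct = np.convolve(f1, f2) * dx

# Convolution theorem: pad to full length, multiply spectra, invert
L = len(f1) + len(f2) - 1
fft_conv = np.fft.irfft(np.fft.rfft(f1, L) * np.fft.rfft(f2, L), L) * dx

max_diff = np.max(np.abs(direct - fft_conv))
print(f'max |direct conv - FFT conv| = {max_diff:.2e}')
print('F{f*g} = F{f}·F{g}: multiplying spectra reproduces the convolution integral')
```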
22. Numerical Stability in Integration
Floating-point errors accumulate in numerical integration. Key issues:
- Cancellation error: subtracting nearly equal numbers loses precision (see the sketch below)
- Log-sum-exp trick: compute $\log \sum_i e^{x_i}$ stably by subtracting $\max_i x_i$ before exponentiating
- Kahan summation: compensated summation reduces the accumulated rounding error of an $n$-term sum from $O(n\varepsilon)$ to $O(\varepsilon)$
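A minimal sketch of the cancellation issue from the first bullet (the function is purely illustrative): computing $1 - \cos(x)$ directly for tiny $x$ loses most significant digits, while the algebraically equivalent $2\sin^2(x/2)$ keeps full precision.

```python
import numpy as np

# 1 - cos(x) for small x: the direct form cancels; 2·sin²(x/2) stays accurate
for x in [1e-4, 1e-6, 1e-8]:
    naive = 1.0 - np.cos(x)
    stable = 2.0 * np.sin(x/2)**2        # accurate to machine precision here
    rel_err = abs(naive - stable) / stable
    print(f'  x={x:.0e}: naive={naive:.12e}, stable={stable:.12e}, naive rel err ≈ {rel_err:.1e}')
```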
Code cell 53
# === 22.1 Log-Sum-Exp and Numerical Stability ===
import numpy as np
# Log-sum-exp: log(sum_i exp(x_i)) — naive overflow
def logsumexp_naive(x):
return np.log(np.sum(np.exp(x)))
def logsumexp_stable(x):
m = np.max(x)
return m + np.log(np.sum(np.exp(x - m)))
x_small = np.array([1.0, 2.0, 3.0])
x_large = np.array([1000.0, 1001.0, 1002.0]) # naive overflows
x_neg = np.array([-1000.0, -1001.0, -1002.0]) # naive underflows to 0
from scipy.special import logsumexp as scipy_lse
print('Log-Sum-Exp Stability:')
for name, x in [('small', x_small), ('large', x_large), ('negative', x_neg)]:
naive = logsumexp_naive(x)
stable = logsumexp_stable(x)
scipy_val = scipy_lse(x)
ok = np.isfinite(stable) and abs(stable - scipy_val) < 1e-8
print(f" {'PASS' if ok else 'FAIL'} {name}: naive={naive}, stable={stable:.6f}, scipy={scipy_val:.6f}")
# Kahan summation
def kahan_sum(xs):
s, c = 0.0, 0.0
for x in xs:
y = x - c
t = s + y
c = (t - s) - y
s = t
return s
print('\nKahan summation vs naive:')
# Sum of 1/n for n=1..N — compare with true harmonic number
N = 1000000
xs = np.array([1/n for n in range(1, N+1)])
naive_sum = float(np.sum(xs))
kahan = kahan_sum(xs)
# True: H_N ≈ ln(N) + gamma
euler_gamma = 0.5772156649
true_approx = np.log(N) + euler_gamma
print(f' Naive: {naive_sum:.10f}')
print(f' Kahan: {kahan:.10f}')
print(f' ~True: {true_approx:.10f} (Euler-Mascheroni estimate)')