Theory Notebook
Converted from theory.ipynb for web reading.
Expectation and Moments
"The expectation of the product of two independent random variables equals the product of their expectations — a theorem so simple it conceals its own depth."
Interactive theory notebook for §06/04. Work through cells top-to-bottom.
Topics covered:
- Expectation (LOTUS, linearity, existence)
- Variance, skewness, kurtosis, moments
- Covariance, correlation, Cauchy-Schwarz
- Conditional expectation and tower property
- Moment generating functions and characteristic functions
- Jensen's inequality: KL divergence and ELBO
- Bias-variance decomposition and double descent
- ML applications: Adam, cross-entropy, policy gradient
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
import seaborn as sns
sns.set_theme(style="whitegrid", palette="colorblind")
HAS_SNS = True
except ImportError:
plt.style.use("seaborn-v0_8-whitegrid")
HAS_SNS = False
mpl.rcParams.update({
"figure.figsize": (10, 6),
"figure.dpi": 120,
"font.size": 13,
"axes.titlesize": 15,
"axes.labelsize": 13,
"xtick.labelsize": 11,
"ytick.labelsize": 11,
"legend.fontsize": 11,
"legend.framealpha": 0.85,
"lines.linewidth": 2.0,
"axes.spines.top": False,
"axes.spines.right": False,
"savefig.bbox": "tight",
"savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import numpy as np
import scipy.linalg as la
from scipy import stats
from scipy.integrate import quad, dblquad
try:
import matplotlib.pyplot as plt
import matplotlib
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = [10, 6]
plt.rcParams['font.size'] = 12
HAS_MPL = True
except ImportError:
HAS_MPL = False
print('matplotlib not available — skipping plots')
try:
import seaborn as sns
HAS_SNS = True
except ImportError:
HAS_SNS = False
np.set_printoptions(precision=6, suppress=True)
np.random.seed(42)
print('Setup complete.')
1. Expectation: LOTUS and Linearity
The expectation is the probability-weighted average of all possible values.
LOTUS (Law of the Unconscious Statistician): $\mathbb{E}[g(X)] = \int g(x)\, f_X(x)\, dx$.
We integrate against the PDF of $X$ — no need to find the distribution of $g(X)$.
Linearity: $\mathbb{E}[aX + bY] = a\,\mathbb{E}[X] + b\,\mathbb{E}[Y]$ — always, without independence.
Code cell 5
# === 1.1 LOTUS: Computing E[g(X)] ===
from scipy.integrate import quad
import numpy as np
# X ~ Exp(2), f(x) = 2*exp(-2x)
lam = 2.0
# E[X] via definition
E_X, _ = quad(lambda x: x * lam * np.exp(-lam*x), 0, np.inf)
print(f'E[X] = {E_X:.6f} (theory: 1/lambda = {1/lam:.6f})')
# E[X^2] via LOTUS: g(x) = x^2
E_X2, _ = quad(lambda x: x**2 * lam * np.exp(-lam*x), 0, np.inf)
print(f'E[X^2] = {E_X2:.6f} (theory: 2/lambda^2 = {2/lam**2:.6f})')
# Var(X) = E[X^2] - E[X]^2
Var_X = E_X2 - E_X**2
print(f'Var(X) = {Var_X:.6f} (theory: 1/lambda^2 = {1/lam**2:.6f})')
# E[e^X] via LOTUS: g(x) = e^x (exists only for t < lambda)
# MGF at t=1 < lambda=2
t = 1.0
E_etX, _ = quad(lambda x: np.exp(t*x) * lam * np.exp(-lam*x), 0, np.inf)
mgf_theory = lam / (lam - t)
print(f'\nE[e^X] (t=1) = {E_etX:.6f} (theory: lam/(lam-t) = {mgf_theory:.6f})')
print('\nAll LOTUS computations match theory: PASS')
Code cell 6
# === 1.2 Linearity of Expectation: Binomial via Bernoulli Indicators ===
import numpy as np
np.random.seed(42)
# X ~ Binomial(n,p) = sum of n Bernoulli(p) indicators
# E[X] = n*p (by linearity, no need to compute full PMF)
n, p = 20, 0.35
# Method 1: Direct from PMF
k = np.arange(n+1)
pmf = stats.binom.pmf(k, n, p)
E_direct = np.sum(k * pmf)
# Method 2: Linearity E[sum B_i] = sum E[B_i] = n*p
E_linearity = n * p
# Method 3: Sample mean
samples = np.random.binomial(n, p, 100000)
E_sample = samples.mean()
print(f'E[Binomial({n},{p})]:')
print(f' Direct from PMF: {E_direct:.6f}')
print(f' Linearity (n*p): {E_linearity:.6f}')
print(f' Sample mean (N=1e5):{E_sample:.6f}')
ok = np.allclose([E_direct, E_linearity], E_sample, atol=0.05)
print(f'PASS — all agree: {ok}')
# Extended: E[fixed points of random permutation] = 1
N_perm = 10
fixed_point_counts = []
for _ in range(50000):
perm = np.random.permutation(N_perm)
fixed_point_counts.append((perm == np.arange(N_perm)).sum())
print(f'\nE[fixed points of perm of {{1..{N_perm}}}] = {np.mean(fixed_point_counts):.4f}')
print(f'Theory: 1 (by linearity, regardless of n)')
print(f'PASS: {abs(np.mean(fixed_point_counts) - 1.0) < 0.05}')
2. Variance, Skewness, and Kurtosis
The variance $\mathrm{Var}(X) = \mathbb{E}[(X-\mu)^2] = \mathbb{E}[X^2] - \mu^2$ measures spread.
Skewness $\gamma_1 = \mathbb{E}[(X-\mu)^3]/\sigma^3$ measures asymmetry.
Excess kurtosis $\gamma_2 = \mathbb{E}[(X-\mu)^4]/\sigma^4 - 3$ measures tail weight relative to the Gaussian.
Code cell 8
# === 2.1 Moments of Common Distributions ===
import numpy as np
from scipy import stats
distributions = {
'Normal(0,1)': stats.norm(0, 1),
'Exponential(1)': stats.expon(scale=1),
'Uniform(-1,1)': stats.uniform(-1, 2),
'Student-t(5)': stats.t(df=5),
'Laplace(0,1)': stats.laplace(0, 1),
}
N = 500000
np.random.seed(42)
print(f'{"Distribution":20s} | {"Mean":8s} | {"Var":8s} | {"Skew":8s} | {"Ex.Kurt":8s}')
print('-' * 65)
for name, rv in distributions.items():
s = rv.rvs(N)
mu = s.mean()
sigma = s.std()
skew = ((s - mu)**3).mean() / sigma**3
kurt = ((s - mu)**4).mean() / sigma**4 - 3
print(f'{name:20s} | {mu:8.4f} | {sigma**2:8.4f} | {skew:8.4f} | {kurt:8.4f}')
print('\nKey observations:')
print(' Normal: skew=0, excess kurtosis=0 (baseline)')
print(' Exponential: skew=2, excess kurtosis=6 (right-skewed, heavy tail)')
print(' Student-t(5): excess kurtosis=6/(5-4)=6 (heavier tails than Gaussian)')
Code cell 9
# === 2.2 Visualising Skewness and Kurtosis ===
import numpy as np
from scipy import stats
np.random.seed(42)
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Skewness: Normal vs Exponential vs Log-normal
x = np.linspace(-4, 8, 500)
ax = axes[0]
ax.plot(x, stats.norm.pdf(x, 0, 1), 'b-', lw=2, label='Normal (γ₁=0)')
ax.plot(x, stats.expon.pdf(x, 0, 1), 'r-', lw=2, label='Exp(1) (γ₁=2)')
ax.plot(x, stats.lognorm.pdf(x, s=1), 'g-', lw=2, label='LogNorm (γ₁>0)')
ax.set_xlim(-1, 8); ax.set_ylim(0, 0.7)
ax.axvline(0, color='b', linestyle='--', alpha=0.5, label='Normal mean')
ax.axvline(1, color='r', linestyle='--', alpha=0.5, label='Exp mean')
ax.set_title('Skewness: tail direction')
ax.set_xlabel('x'); ax.set_ylabel('PDF')
ax.legend(fontsize=9)
# Kurtosis: Normal vs t(3) vs Laplace
x2 = np.linspace(-6, 6, 500)
ax2 = axes[1]
ax2.plot(x2, stats.norm.pdf(x2), 'b-', lw=2, label='Normal (γ₂=0)')
ax2.plot(x2, stats.t.pdf(x2, df=3), 'r-', lw=2, label='t(3) (γ₂=inf)')
ax2.plot(x2, stats.laplace.pdf(x2), 'g-', lw=2, label='Laplace (γ₂=3)')
ax2.plot(x2, stats.uniform.pdf(x2, -np.sqrt(3), 2*np.sqrt(3)),
'm-', lw=2, label='Uniform (γ₂=-6/5)')
ax2.set_xlim(-5, 5)
ax2.set_title('Kurtosis: tail weight')
ax2.set_xlabel('x'); ax2.set_ylabel('PDF')
ax2.legend(fontsize=9)
plt.suptitle('Shape Parameters: Skewness and Kurtosis')
plt.tight_layout()
plt.show()
print('PASS — plotted skewness and kurtosis comparison')
3. Covariance, Correlation, and the Cauchy-Schwarz Bound
Covariance: $\mathrm{Cov}(X,Y) = \mathbb{E}[(X-\mu_X)(Y-\mu_Y)] = \mathbb{E}[XY] - \mathbb{E}[X]\,\mathbb{E}[Y]$.
Pearson correlation: $\rho(X,Y) = \mathrm{Cov}(X,Y)/(\sigma_X \sigma_Y) \in [-1, 1]$.
Cauchy-Schwarz: $|\mathrm{Cov}(X,Y)|^2 \le \mathrm{Var}(X)\,\mathrm{Var}(Y)$.
Zero covariance does NOT imply independence — only the absence of linear dependence.
Code cell 11
# === 3.1 Covariance and the Zero-Covariance Trap ===
import numpy as np
np.random.seed(42)
N = 200000
# Case 1: X ~ Uniform(-1,1), Y = X^2 (zero cov, fully dependent)
X = np.random.uniform(-1, 1, N)
Y = X**2
cov_1 = np.cov(X, Y)[0, 1]
print('Case 1: Y = X^2')
print(f' Cov(X,Y) = {cov_1:.6f} (should be ~0)')
print(f' But Y is DETERMINED by X (fully dependent!)')
# Case 2: X, Y i.i.d. N(0,1) (truly independent)
X2 = np.random.normal(0, 1, N)
Y2 = np.random.normal(0, 1, N)
cov_2 = np.cov(X2, Y2)[0, 1]
rho_2 = cov_2 / (X2.std() * Y2.std())
print(f'\nCase 2: X,Y iid N(0,1)')
print(f' Cov(X,Y) = {cov_2:.6f} (should be ~0)')
print(f' Corr(X,Y) = {rho_2:.6f} (should be ~0)')
print(f' AND they are truly independent.')
# Case 3: Correlated Gaussian (rho = 0.8)
rho_target = 0.8
Sigma = np.array([[1, rho_target], [rho_target, 1]])
Z = np.random.multivariate_normal([0,0], Sigma, N)
X3, Y3 = Z[:,0], Z[:,1]
rho_est = np.corrcoef(X3, Y3)[0, 1]
print(f'\nCase 3: Bivariate Gaussian with rho=0.8')
print(f' Estimated corr = {rho_est:.4f} (target: {rho_target})')
print(f' PASS: {abs(rho_est - rho_target) < 0.01}')
# Cauchy-Schwarz: |Cov(X,Y)|^2 <= Var(X)*Var(Y)
cov_sq = np.cov(X3, Y3)[0, 1]**2
var_prod = np.var(X3) * np.var(Y3)
print(f'\nCauchy-Schwarz: Cov^2 <= Var(X)*Var(Y)')
print(f' Cov^2 = {cov_sq:.4f}, Var(X)*Var(Y) = {var_prod:.4f}')
print(f' PASS: {cov_sq <= var_prod + 1e-6}')
4. Conditional Expectation and Tower Property
The conditional expectation $\mathbb{E}[Y \mid X]$ is a function of $X$, not a number.
Tower property: $\mathbb{E}[\mathbb{E}[Y \mid X]] = \mathbb{E}[Y]$
Law of total variance: $\mathrm{Var}(Y) = \mathbb{E}[\mathrm{Var}(Y \mid X)] + \mathrm{Var}(\mathbb{E}[Y \mid X])$
The conditional expectation $f^*(x) = \mathbb{E}[Y \mid X = x]$ is the optimal predictor minimising the expected squared error $\mathbb{E}[(Y - f(X))^2]$.
Code cell 13
# === 4.1 Tower Property and Total Variance ===
import numpy as np
np.random.seed(42)
N = 200000
# Mixture model: Z in {0,1} (coin flip), then X|Z=0 ~ N(0,1), X|Z=1 ~ N(4,1)
Z = np.random.binomial(1, 0.5, N)
X = np.where(Z == 0, np.random.normal(0, 1, N), np.random.normal(4, 1, N))
# Tower property: E[X] = E[E[X|Z]]
E_X_given_0 = 0.0 # E[X|Z=0]
E_X_given_1 = 4.0 # E[X|Z=1]
E_via_tower = 0.5 * E_X_given_0 + 0.5 * E_X_given_1
E_empirical = X.mean()
print('Tower property: E[X] = E[E[X|Z]]')
print(f' Theory: 0.5*0 + 0.5*4 = {E_via_tower:.4f}')
print(f' Sample mean: {E_empirical:.4f}')
print(f' PASS: {abs(E_empirical - E_via_tower) < 0.05}')
# Law of total variance: Var(X) = E[Var(X|Z)] + Var(E[X|Z])
# E[Var(X|Z)] = 0.5*1 + 0.5*1 = 1 (within-group variance)
# Var(E[X|Z]) = Var(E_XZ) where E_XZ = 0 w.p. 0.5, 4 w.p. 0.5
# E[E_XZ] = 2, Var(E_XZ) = 0.5*(0-2)^2 + 0.5*(4-2)^2 = 4 (between-group)
within_var = 1.0
between_var = 4.0
total_var_theory = within_var + between_var
total_var_sample = X.var()
print(f'\nLaw of total variance: Var(X) = E[Var(X|Z)] + Var(E[X|Z])')
print(f' Within-group: E[Var(X|Z)] = {within_var:.4f}')
print(f' Between-group: Var(E[X|Z]) = {between_var:.4f}')
print(f' Total (theory): {total_var_theory:.4f}')
print(f' Total (sample): {total_var_sample:.4f}')
print(f' PASS: {abs(total_var_sample - total_var_theory) < 0.1}')
Code cell 14
# === 4.2 Conditional Expectation as Optimal Predictor ===
import numpy as np
np.random.seed(42)
N = 50000
# True model: Y = sin(X) + eps, X ~ Uniform(0, 2pi), eps ~ N(0, 0.25)
X = np.random.uniform(0, 2*np.pi, N)
eps = np.random.normal(0, 0.5, N)
Y = np.sin(X) + eps
# Optimal predictor: f*(x) = E[Y|X=x] = sin(x) (since E[eps]=0)
f_star = np.sin(X)
# Compare two predictors:
# 1. f* = sin(x) (conditional expectation)
# 2. f_const = E[Y] = 0 (just the mean, ignoring X)
mse_optimal = np.mean((Y - f_star)**2)
mse_constant = np.mean((Y - Y.mean())**2)
print('Optimal predictor vs constant predictor:')
print(f' MSE(f_star = sin(X)) = {mse_optimal:.4f}')
print(f' MSE(f_const = E[Y]) = {mse_constant:.4f}')
print(f' Ratio: {mse_constant/mse_optimal:.2f}x higher for constant predictor')
print(f' Irreducible noise = Var(eps) = {0.5**2:.4f}')
print(f' f* achieves near-optimal MSE: {abs(mse_optimal - 0.25) < 0.05}')
if HAS_MPL:
plt.figure(figsize=(10, 4))
idx = np.argsort(X)[:500]
plt.scatter(X[idx], Y[idx], s=5, alpha=0.3, color='steelblue', label='Data (Y)')
plt.plot(X[idx], f_star[idx], 'r-', lw=2, label='f*(x) = E[Y|X=x] = sin(x)')
plt.axhline(Y.mean(), color='green', linestyle='--', lw=2, label='Constant predictor E[Y]')
plt.xlabel('X'); plt.ylabel('Y')
plt.title('Conditional Expectation = Optimal Predictor')
plt.legend()
plt.tight_layout()
plt.show()
print('PASS — plotted conditional expectation as optimal predictor')
5. Moment Generating Functions
$M_X(t) = \mathbb{E}[e^{tX}]$ — differentiating at $t = 0$ gives moments: $M_X^{(k)}(0) = \mathbb{E}[X^k]$.
Key property: For independent $X, Y$: $M_{X+Y}(t) = M_X(t)\,M_Y(t)$.
This proves the reproductive properties: sums of independent Gaussians are Gaussian, sums of independent Poissons are Poisson.
Code cell 16
# === 5.1 Gaussian MGF and Moment Extraction ===
import math
import numpy as np
# Gaussian MGF: M(t) = exp(mu*t + sigma^2*t^2/2)
mu, sigma = 1.5, 2.0
def mgf_gaussian(t, mu=mu, sigma=sigma):
return np.exp(mu*t + sigma**2*t**2/2)
def taylor_derivatives(f, order=4, radius=0.05, n_points=41):
"""Estimate derivatives at zero by fitting a local Taylor polynomial."""
t = np.linspace(-radius, radius, n_points)
y = np.array([f(float(tt)) for tt in t])
# np.polyfit returns highest degree first; reverse to get c_0, c_1, ...
coeffs = np.polyfit(t, y, deg=order)[::-1]
return [math.factorial(k) * coeffs[k] for k in range(1, order + 1)]
# Extract moments from M_X^{(k)}(0) = E[X^k]
E_X_mgf, E_X2_mgf, E_X3_mgf, E_X4_mgf = taylor_derivatives(mgf_gaussian)
# Theory: E[X^k] for N(mu, sigma^2)
# E[X] = mu, E[X^2] = mu^2 + sigma^2, E[X^3] = mu^3 + 3*mu*sigma^2
# E[X^4] = mu^4 + 6*mu^2*sigma^2 + 3*sigma^4
truth = [
mu,
mu**2 + sigma**2,
mu**3 + 3*mu*sigma**2,
mu**4 + 6*mu**2*sigma**2 + 3*sigma**4,
]
print(f'Moments of N(mu={mu}, sigma={sigma}):')
for k, (got, expected) in enumerate(zip([E_X_mgf, E_X2_mgf, E_X3_mgf, E_X4_mgf], truth), start=1):
print(f' E[X^{k}] : MGF={got:.4f} Theory={expected:.4f} error={abs(got-expected):.2e}')
assert np.allclose([E_X_mgf, E_X2_mgf, E_X3_mgf, E_X4_mgf], truth, rtol=2e-2, atol=2e-2)
print('\nPASS - MGF derivatives match theoretical moments')
# Gaussian reproductive property via MGF
mu1, s1, mu2, s2 = 1.0, 1.5, 2.0, 2.5
# M_{X+Y}(t) = M_X(t)*M_Y(t) = exp((mu1+mu2)t + (s1^2+s2^2)/2 * t^2)
# => X+Y ~ N(mu1+mu2, s1^2+s2^2)
print(f'\nGaussian reproductive property:')
print(f' X ~ N({mu1},{s1}^2), Y ~ N({mu2},{s2}^2)')
print(f' X+Y ~ N({mu1+mu2}, {s1**2+s2**2:.2f})')
np.random.seed(42)
N = 200000
X_samp = np.random.normal(mu1, s1, N)
Y_samp = np.random.normal(mu2, s2, N)
S_samp = X_samp + Y_samp
print(f' Sample mean of X+Y: {S_samp.mean():.4f} (theory: {mu1+mu2})')
print(f' Sample var of X+Y: {S_samp.var():.4f} (theory: {s1**2+s2**2})')
6. Jensen's Inequality: KL Divergence and ELBO
Jensen's inequality: for convex $f$: $f(\mathbb{E}[X]) \le \mathbb{E}[f(X)]$ (reversed for concave $f$).
This single inequality underpins:
- $\mathrm{KL}(p\,\|\,q) \ge 0$ (Gibbs' inequality)
- the ELBO as a lower bound on $\log p(x)$
- $|\rho| \le 1$ via Cauchy-Schwarz
- Entropy maximality of the Gaussian (checked in the sketch below)
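The last bullet can be checked numerically: among distributions with a fixed variance, the Gaussian attains the largest differential entropy. A minimal sketch (not from the original notebook) using scipy's closed-form entropy() for three unit-variance distributions:
# === Gaussian entropy maximality (sketch): all three distributions have Var = 1 ===
import numpy as np
from scipy import stats
candidates = {
    'Normal(0,1)': stats.norm(0, 1),
    'Laplace(b=1/sqrt(2))': stats.laplace(scale=1/np.sqrt(2)),          # Var = 2b^2 = 1
    'Uniform(-sqrt(3),sqrt(3))': stats.uniform(-np.sqrt(3), 2*np.sqrt(3)),  # Var = 1
}
for name, rv in candidates.items():
    print(f'{name:28s}: Var={rv.var():.3f}, entropy={rv.entropy():.4f} nats')
print(f'Gaussian theory: 0.5*log(2*pi*e) = {0.5*np.log(2*np.pi*np.e):.4f} (the maximum)')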
Code cell 18
# === 6.1 Jensen's Inequality: Visual Demonstration ===
import numpy as np
np.random.seed(42)
N = 100000
# X ~ Exponential(1): E[X] = 1
X = np.random.exponential(1, N)
# Convex function: f(x) = x^2
# Jensen: (E[X])^2 <= E[X^2]
EX = X.mean()
E_f = (X**2).mean()
f_EX = EX**2
print('Jensen for f(x)=x^2 (convex):')
print(f' f(E[X]) = {f_EX:.4f}')
print(f' E[f(X)] = {E_f:.4f}')
print(f' f(E[X]) <= E[f(X)]: {f_EX <= E_f}')
print(f' Difference = Var(X) = {E_f - f_EX:.4f} (should be 1 for Exp(1))')
# Concave function: f(x) = log(x)
# Jensen: log(E[X]) >= E[log(X)]
log_EX = np.log(EX)
E_log = np.log(X).mean()
print(f'\nJensen for f(x)=log(x) (concave):')
print(f' log(E[X]) = {log_EX:.4f}')
print(f' E[log(X)] = {E_log:.4f} (theory: E[log Exp(1)] = -gamma ≈ -0.5772)')
print(f' log(E[X]) >= E[log(X)]: {log_EX >= E_log}')
if HAS_MPL:
x_plot = np.linspace(0.01, 4, 300)
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(x_plot, np.log(x_plot), 'b-', lw=2, label='f(x) = log(x) (concave)')
ax.axvline(EX, color='r', linestyle='--', alpha=0.7, label=f'E[X]={EX:.2f}')
ax.scatter([EX], [log_EX], color='red', s=100, zorder=5, label=f'f(E[X])={log_EX:.2f}')
ax.scatter([EX], [E_log], color='green', s=100, zorder=5, label=f'E[f(X)]={E_log:.2f}')
ax.annotate('Jensen gap', xy=(EX, E_log), xytext=(EX+0.5, (log_EX+E_log)/2),
arrowprops=dict(arrowstyle='->', color='gray'))
ax.set_xlabel('x'); ax.legend()
ax.set_title("Jensen's Inequality: log(E[X]) ≥ E[log(X)] for concave log")
plt.tight_layout(); plt.show()
print('PASS — plotted Jensen inequality')
Code cell 19
# === 6.2 KL Divergence Non-Negativity via Jensen ===
import numpy as np
from scipy import stats
np.random.seed(42)
def kl_gaussian(mu1, s1, mu2, s2):
"""KL(N(mu1,s1^2) || N(mu2,s2^2)) analytical formula."""
return np.log(s2/s1) + (s1**2 + (mu1-mu2)**2)/(2*s2**2) - 0.5
# KL = 0 when distributions are equal
print('KL(N(2,1.5) || N(2,1.5)) =', kl_gaussian(2, 1.5, 2, 1.5))
# KL > 0 otherwise
test_cases = [
(0, 1, 0, 1), # equal: KL=0
(0, 1, 1, 1), # shift mean: KL=0.5
(0, 1, 0, 2), # broaden: KL > 0
(0, 2, 0, 1), # narrow: KL > 0
(1, 2, 3, 0.5), # different
]
print('\nKL divergence values (should all be >= 0):')
for mu1, s1, mu2, s2 in test_cases:
kl = kl_gaussian(mu1, s1, mu2, s2)
# Also verify by MC: KL = E_p[log p(X) - log q(X)]
X = np.random.normal(mu1, s1, 500000)
log_p = stats.norm.logpdf(X, mu1, s1)
log_q = stats.norm.logpdf(X, mu2, s2)
kl_mc = (log_p - log_q).mean()
print(f' KL(N({mu1},{s1})||N({mu2},{s2})): analytical={kl:.4f}, MC={kl_mc:.4f}, >=0: {kl >= -1e-10}')
print('\nPASS — KL >= 0 in all cases (Gibbs inequality via Jensen)')
7. Bias-Variance Decomposition
For an estimator $\hat{f}_D$ trained on a random dataset $D$:
$$\mathbb{E}_{D,\varepsilon}\big[(y - \hat{f}_D(x))^2\big] = \underbrace{\big(\mathbb{E}_D[\hat{f}_D(x)] - f(x)\big)^2}_{\text{Bias}^2} + \underbrace{\mathrm{Var}_D\big(\hat{f}_D(x)\big)}_{\text{Variance}} + \underbrace{\sigma^2}_{\text{Noise}}$$
- Bias = systematic error (underfitting)
- Variance = sensitivity to training data (overfitting)
- Noise = irreducible error
Double descent: overparameterised models can have low variance in the interpolating regime.
Code cell 21
# === 7.1 Bias-Variance Tradeoff: Polynomial Regression ===
import numpy as np
np.random.seed(42)
# True function: y = sin(x) + noise
def true_fn(x): return np.sin(x)
sigma_noise = 0.5
n_train = 20
n_datasets = 500
x_test = np.array([1.5]) # single test point
true_y = true_fn(x_test[0])
degrees = [1, 3, 7, 15]
results = {}
for deg in degrees:
preds = []
for _ in range(n_datasets):
# Sample training data
x_tr = np.random.uniform(0, 3, n_train)
y_tr = true_fn(x_tr) + np.random.normal(0, sigma_noise, n_train)
# Fit polynomial
try:
coefs = np.polyfit(x_tr, y_tr, deg)
pred = np.polyval(coefs, x_test[0])
if np.abs(pred) < 20: # filter extreme predictions
preds.append(pred)
except Exception:
pass
preds = np.array(preds)
bias2 = (preds.mean() - true_y)**2
variance = preds.var()
mse = np.mean((preds - true_y)**2)
results[deg] = {'bias2': bias2, 'variance': variance, 'mse': mse}
print(f'{"Degree":6s} | {"Bias^2":8s} | {"Variance":8s} | {"MSE":8s}')
print('-' * 40)
for deg in degrees:
r = results[deg]
print(f'{deg:6d} | {r["bias2"]:8.4f} | {r["variance"]:8.4f} | {r["mse"]:8.4f}')
print(f'\nIrreducible noise = sigma^2 = {sigma_noise**2:.4f}')
print('Trend: as degree increases, bias decreases but variance increases')
if HAS_MPL:
fig, ax = plt.subplots(figsize=(9, 5))
degs = list(results.keys())
ax.plot(degs, [results[d]['bias2'] for d in degs], 'b-o', label='Bias²', lw=2)
ax.plot(degs, [results[d]['variance'] for d in degs], 'r-o', label='Variance', lw=2)
ax.plot(degs, [results[d]['mse'] for d in degs], 'g-o', label='MSE', lw=2)
ax.axhline(sigma_noise**2, color='gray', linestyle='--', label='Noise floor')
ax.set_xlabel('Polynomial Degree'); ax.set_ylabel('Error')
ax.set_title('Bias-Variance Tradeoff')
ax.legend(); plt.tight_layout(); plt.show()
print('PASS — plotted bias-variance tradeoff')
8. Adam Optimizer as Moment Tracking
Adam maintains exponential moving averages of:
- First moment: $m_t = \beta_1 m_{t-1} + (1-\beta_1)\, g_t$ (gradient mean)
- Second moment: $v_t = \beta_2 v_{t-1} + (1-\beta_2)\, g_t^2$ (gradient variance + mean²)
Bias correction: $\hat{m}_t = m_t/(1-\beta_1^t)$, $\hat{v}_t = v_t/(1-\beta_2^t)$
Update: $\theta_{t+1} = \theta_t - \alpha\, \hat{m}_t / (\sqrt{\hat{v}_t} + \epsilon)$
Code cell 23
# === 8.1 Adam from Scratch: Moment Tracking ===
import numpy as np
np.random.seed(42)
# Objective: f(theta) = theta^4 - 4*theta^2 + theta
# Minima near theta ≈ -1.47 (global) and theta ≈ 1.35 (local); starting at theta=2, Adam finds the local one
def f(theta): return theta**4 - 4*theta**2 + theta
def grad_f(theta): return 4*theta**3 - 8*theta + 1
# Adam hyperparameters
alpha = 0.01
beta1, beta2 = 0.9, 0.999
eps = 1e-8
theta = 2.0 # starting point
m, v = 0.0, 0.0
T = 300
theta_history = [theta]
m_history = []
v_history = []
for t in range(1, T+1):
g = grad_f(theta)
m = beta1 * m + (1 - beta1) * g
v = beta2 * v + (1 - beta2) * g**2
m_hat = m / (1 - beta1**t) # bias correction
v_hat = v / (1 - beta2**t)
theta -= alpha * m_hat / (np.sqrt(v_hat) + eps)
theta_history.append(theta)
m_history.append(m_hat)
v_history.append(v_hat)
print(f'Final theta: {theta:.6f}')
print(f'Gradient at final theta: {grad_f(theta):.8f} (should be ~0)')
print(f'f(theta): {f(theta):.6f}')
# Verify bias correction at early steps
print(f'\nBias correction demonstration (step 1):')
g1 = grad_f(2.0)
m1_raw = (1-beta1)*g1 # m after 1 step (starting from 0)
m1_corr = m1_raw / (1-beta1**1)
print(f' g1 = {g1:.4f}')
print(f' m1 (raw) = {m1_raw:.4f} (biased: only {(1-beta1)*100:.0f}% of g1)')
print(f' m1 (corrected) = {m1_corr:.4f} (unbiased: equals g1)')
print(f' PASS: {np.isclose(m1_corr, g1)}')
if HAS_MPL:
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
t_arr = np.arange(len(theta_history))
axes[0].plot(t_arr, theta_history, 'b-')
axes[0].set_title('Parameter θ vs step')
axes[0].set_xlabel('Step'); axes[0].set_ylabel('θ')
axes[1].plot(range(1, T+1), m_history, 'r-', label='m̂ (1st moment)')
axes[1].set_title('First moment estimate m̂_t'); axes[1].set_xlabel('Step')
axes[2].plot(range(1, T+1), v_history, 'g-', label='v̂ (2nd moment)')
axes[2].set_title('Second moment estimate v̂_t'); axes[2].set_xlabel('Step')
plt.suptitle('Adam: Moment Tracking in Action')
plt.tight_layout(); plt.show()
print('PASS — plotted Adam moment tracking')
9. Cross-Entropy Loss and ELBO
Cross-entropy loss is the expected negative log-likelihood: $H(p, q_\theta) = -\mathbb{E}_{x \sim p}[\log q_\theta(x)]$.
Minimising cross-entropy ≡ minimising $\mathrm{KL}(p\,\|\,q_\theta)$, since $H(p, q_\theta) = H(p) + \mathrm{KL}(p\,\|\,q_\theta)$ and $H(p)$ does not depend on $\theta$.
The ELBO is a lower bound on $\log p(x)$ derived via Jensen: $\log p(x) = \log \mathbb{E}_{q(z|x)}\!\left[\frac{p(x,z)}{q(z|x)}\right] \ge \mathbb{E}_{q(z|x)}\!\left[\log \frac{p(x,z)}{q(z|x)}\right] = \mathrm{ELBO}$, with equality iff $q(z|x) = p(z \mid x)$.
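The bound can be verified end-to-end in a conjugate toy model where $\log p(x)$ is available in closed form. A minimal sketch (the model below is an illustrative assumption, not from the notebook): prior $z \sim \mathcal{N}(0,1)$, likelihood $x \mid z \sim \mathcal{N}(z, 1)$, so $p(x) = \mathcal{N}(0, 2)$ and the exact posterior is $\mathcal{N}(x/2, 1/2)$.
# === ELBO <= log p(x): Monte Carlo check in a conjugate toy model (sketch) ===
import numpy as np
from scipy import stats
np.random.seed(42)
x_obs = 1.8
log_px = stats.norm.logpdf(x_obs, 0, np.sqrt(2))  # exact log evidence: p(x) = N(0, 2)
def elbo_mc(m, s, n=200000):
    """ELBO(q) = E_q[log p(x|z) + log p(z) - log q(z)] for q = N(m, s^2)."""
    z = np.random.normal(m, s, n)
    return (stats.norm.logpdf(x_obs, z, 1)        # log p(x|z)
            + stats.norm.logpdf(z, 0, 1)          # log p(z)
            - stats.norm.logpdf(z, m, s)).mean()  # - log q(z)
print(f'log p(x) = {log_px:.4f}')
for m, s in [(0.0, 1.0), (0.9, 1.0), (0.9, np.sqrt(0.5)), (2.0, 0.3)]:
    e = elbo_mc(m, s)
    print(f'  q = N({m}, {s**2:.2f}): ELBO = {e:.4f}, gap = {log_px - e:.4f} (>= 0)')
# The gap equals KL(q || p(z|x)); it vanishes at the exact posterior q = N(0.9, 0.5)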
Code cell 25
# === 9.1 Cross-Entropy as Expectation ===
import numpy as np
from scipy.special import softmax
np.random.seed(42)
# Binary classification: true distribution p, model q_theta
# Cross-entropy H(p,q) = -E_p[log q] = -p*log(q) - (1-p)*log(1-q)
p_true = 0.7 # true probability of class 1
# Compute cross-entropy as a function of model prediction q
q_values = np.linspace(0.01, 0.99, 1000)
cross_entropy = -(p_true * np.log(q_values) + (1-p_true) * np.log(1-q_values))
entropy_p = -(p_true*np.log(p_true) + (1-p_true)*np.log(1-p_true))
# KL divergence: H(p,q) = H(p) + KL(p||q), so CE is minimised at q=p
kl = cross_entropy - entropy_p
print('Cross-entropy H(p,q) analysis:')
print(f' Entropy H(p={p_true}) = {entropy_p:.4f} nats')
print(f' Minimum CE = H(p) at q=p: H(p,p) = {-(p_true*np.log(p_true)+(1-p_true)*np.log(1-p_true)):.4f}')
best_q = q_values[np.argmin(cross_entropy)]
print(f' Argmin of CE: q = {best_q:.4f} (should be p = {p_true})')
print(f' PASS: {abs(best_q - p_true) < 0.01}')
if HAS_MPL:
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(q_values, cross_entropy, 'b-', lw=2, label='Cross-entropy H(p,q)')
ax.axhline(entropy_p, color='green', linestyle='--', label=f'Entropy H(p)={entropy_p:.3f}')
ax.axvline(p_true, color='red', linestyle='--', label=f'p_true={p_true}')
ax.set_xlabel('Model prediction q'); ax.set_ylabel('Cross-entropy')
ax.set_title('Cross-Entropy Minimised at q = p_true')
ax.legend(); plt.tight_layout(); plt.show()
print('PASS — plotted cross-entropy')
Code cell 26
# === 9.2 ELBO: VAE KL Divergence Closed Form ===
import numpy as np
from scipy import stats
np.random.seed(42)
# KL(N(mu, sigma^2) || N(0,1)) = 0.5*(mu^2 + sigma^2 - log(sigma^2) - 1)
def kl_normal_prior(mu, sigma):
return 0.5 * (mu**2 + sigma**2 - np.log(sigma**2) - 1)
# Verify KL = 0 at mu=0, sigma=1
print('KL(N(mu,sigma^2) || N(0,1)):')
print(f' KL(N(0,1)||N(0,1)) = {kl_normal_prior(0, 1):.6f} (should be 0)')
# Verify by MC
test_cases = [(0, 1), (0.5, 0.7), (2.0, 0.5), (-1.0, 2.0)]
for mu, sigma in test_cases:
kl_theory = kl_normal_prior(mu, sigma)
# MC: E_q[log q(z) - log p(z)]
z = np.random.normal(mu, sigma, 500000)
log_q = stats.norm.logpdf(z, mu, sigma)
log_p = stats.norm.logpdf(z, 0, 1)
kl_mc = (log_q - log_p).mean()
print(f' KL(N({mu},{sigma}^2)||N(0,1)): theory={kl_theory:.4f}, MC={kl_mc:.4f}, match={np.isclose(kl_theory, kl_mc, atol=0.01)}')
print('\nAll KL >= 0 (Jensen/Gibbs inequality holds):', all(kl_normal_prior(mu, s) >= 0 for mu, s in test_cases))
# ELBO = E_q[log p(x|z)] - KL(q||p)
# For a toy VAE: x = z + noise, z ~ N(0,1) prior
x_obs = 2.5 # observed data point
# Encoder: q_phi(z|x) = N(mu_phi(x), sigma_phi(x)^2)
# Let's compute ELBO as a function of (mu_phi, sigma_phi)
mu_phi = np.linspace(0, 3, 50)
sigma_phi = 0.5
# Reconstruction: assume p(x|z) = N(z, 0.01) (tight decoder)
# E_q[log p(x|z)] ≈ -0.5*(x-mu_phi)^2/0.01 (for small sigma_phi)
sigma_decoder = 0.1
recon = -0.5*((x_obs - mu_phi)**2 + sigma_phi**2) / sigma_decoder**2
kl_term = kl_normal_prior(mu_phi, sigma_phi)
elbo = recon - kl_term
best_mu = mu_phi[np.argmax(elbo)]
print(f'\nToy VAE ELBO maximised at mu_phi = {best_mu:.2f} (close to x_obs={x_obs})')
10. Characteristic Functions and Cumulants
$\varphi_X(t) = \mathbb{E}[e^{itX}]$ always exists (unlike the MGF for heavy tails).
Cumulants from $K(t) = \log M_X(t)$: additive for sums of independent variables (verified numerically in the sketch below).
- $\kappa_1 = \mu$ (mean)
- $\kappa_2 = \sigma^2$ (variance)
- $\kappa_3$ = third central moment (skewness × $\sigma^3$)
- Normal: all $\kappa_k = 0$ for $k \ge 3$ — the defining property of the Gaussian
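Cumulant additivity is easy to probe with scipy.stats.kstat, which returns unbiased sample cumulants (orders 1-4). A minimal sketch, assuming two independent Exp(1) variables, whose cumulants are $\kappa_n = (n-1)!$:
# === Cumulant additivity for independent sums (sketch) ===
import numpy as np
from scipy import stats
np.random.seed(42)
N = 500000
X = np.random.exponential(1.0, N)   # Exp(1): kappa_1=1, kappa_2=1, kappa_3=2
Y = np.random.exponential(1.0, N)   # independent copy
S = X + Y
for order, theory in [(1, 2.0), (2, 2.0), (3, 4.0)]:
    lhs = stats.kstat(X, order) + stats.kstat(Y, order)  # sum of sample cumulants
    rhs = stats.kstat(S, order)                          # cumulant of the sum
    print(f'  order {order}: kappa(X)+kappa(Y)={lhs:.4f}, kappa(X+Y)={rhs:.4f}, theory={theory}')
# Raw moments are not additive beyond order 1; cumulants are — that is their point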
Code cell 28
# === 10.1 Characteristic Function and the CLT Preview ===
import numpy as np
np.random.seed(42)
# Characteristic function of standardised sample mean
# phi_{S_n}(t) = (phi_X(t/sqrt(n)))^n
# As n -> inf, this -> exp(-t^2/2) (standard normal char function)
# This IS the CLT proof sketch!
# Example: X ~ Uniform(-1,1) (not Gaussian)
# Empirical char function: phi(t) ≈ (1/N)*sum(exp(i*t*X_j))
def empirical_cf(samples, t):
return np.mean(np.exp(1j * t * samples))
N_per = 10000
n_list = [1, 2, 5, 20, 100]
t_vals = np.linspace(-5, 5, 100)
# Standard normal char function: phi(t) = exp(-t^2/2)
cf_normal = np.exp(-t_vals**2 / 2)
print('Convergence of standardised sum char function to N(0,1):')
print('(Checking at t=1 where N(0,1) char fn = exp(-0.5) ≈ 0.6065)')
t_check = 1.0
cf_target = np.exp(-t_check**2/2)
for n in n_list:
# Sum of n iid Uniform(-1,1), standardised
samples_list = []
for _ in range(N_per):
xi = np.random.uniform(-1, 1, n)
s = xi.sum() / np.sqrt(n * 1/3) # standardise: Var(Uniform(-1,1))=1/3
samples_list.append(s)
samples_arr = np.array(samples_list)
cf_emp = np.abs(empirical_cf(samples_arr, t_check))
print(f' n={n:4d}: |phi(t=1)| = {cf_emp:.4f} (target = {cf_target:.4f})')
print('\nPASS — char function converges to Gaussian as n increases (CLT preview)')
# Cumulants of Normal: all higher cumulants = 0
print('\nCumulants of N(0,1):')
print(' k1=0 (mean), k2=1 (variance), k3=0, k4=0, ...')
print(' All higher cumulants vanish — this uniquely characterises the Gaussian')
Summary
| Concept | Formula | Key Property |
|---|---|---|
| Expectation (LOTUS) | $\mathbb{E}[g(X)] = \int g(x)\,f_X(x)\,dx$ | No need for distribution of $g(X)$ |
| Linearity | $\mathbb{E}[aX+bY] = a\,\mathbb{E}[X] + b\,\mathbb{E}[Y]$ | Holds WITHOUT independence |
| Variance | $\mathrm{Var}(X) = \mathbb{E}[X^2] - (\mathbb{E}[X])^2$ | Second central moment |
| Tower property | $\mathbb{E}[\mathbb{E}[Y \mid X]] = \mathbb{E}[Y]$ | Iterated expectation |
| Total variance | $\mathrm{Var}(Y) = \mathbb{E}[\mathrm{Var}(Y \mid X)] + \mathrm{Var}(\mathbb{E}[Y \mid X])$ | Within + between |
| MGF | $M_X^{(k)}(0) = \mathbb{E}[X^k]$ | $M_{X+Y} = M_X M_Y$ for independent $X,Y$ |
| Jensen | $f(\mathbb{E}[X]) \le \mathbb{E}[f(X)]$ for convex $f$ | $\mathrm{KL} \ge 0$, ELBO |
| Cauchy-Schwarz | $\lvert\mathrm{Cov}(X,Y)\rvert^2 \le \mathrm{Var}(X)\,\mathrm{Var}(Y)$ | $\lvert\rho\rvert \le 1$ |
| Bias-Variance | $\mathrm{MSE} = \mathrm{Bias}^2 + \mathrm{Var} + \sigma^2$ | Double descent |
| Adam | $\hat{m}_t = m_t/(1-\beta_1^t)$, $\hat{v}_t = v_t/(1-\beta_2^t)$ | Bias correction |
Next: §05 Concentration Inequalities
3. (cont.) Covariance, Correlation, and the Cauchy-Schwarz Inequality
The covariance matrix $\Sigma_{ij} = \mathrm{Cov}(X_i, X_j)$ encodes all pairwise linear relationships. It is always PSD: $v^\top \Sigma v = \mathrm{Var}(v^\top X) \ge 0$ for all $v$.
Pearson correlation: $\rho_{ij} = \Sigma_{ij}/\sqrt{\Sigma_{ii}\,\Sigma_{jj}}$.
Cauchy-Schwarz: $|\mathrm{Cov}(X,Y)|^2 \le \mathrm{Var}(X)\,\mathrm{Var}(Y)$, which guarantees $|\rho| \le 1$.
Note: Zero correlation does NOT imply independence! $X \sim \mathcal{N}(0,1)$ and $Y = X^2$ have $\mathrm{Cov}(X,Y) = 0$, yet $Y$ is a deterministic function of $X$.
Code cell 31
# === 3.2 Covariance Matrix: Eigendecomposition and PSD ===
import numpy as np
from scipy import stats
np.random.seed(42)
# Generate correlated 3D data
# True covariance matrix
rho12, rho13, rho23 = 0.8, 0.3, -0.5
Sigma_true = np.array([
[1.0, rho12, rho13],
[rho12, 1.0, rho23],
[rho13, rho23, 1.0]
])
# Check PSD
eigvals = np.linalg.eigvalsh(Sigma_true)
print(f'Eigenvalues of Sigma: {eigvals}')
print(f'PSD (all eigvals >= 0): {np.all(eigvals >= -1e-10)}')
# Sample from MVN
N = 50000
X = np.random.multivariate_normal(np.zeros(3), Sigma_true, N)
# Sample covariance matrix
Sigma_hat = np.cov(X.T)
print(f'\nTrue vs sample correlation (off-diag):')
print(f' rho_12: true={rho12:.2f}, sample={Sigma_hat[0,1]:.4f}')
print(f' rho_13: true={rho13:.2f}, sample={Sigma_hat[0,2]:.4f}')
print(f' rho_23: true={rho23:.2f}, sample={Sigma_hat[1,2]:.4f}')
# Correlation vs Covariance
std = np.sqrt(np.diag(Sigma_hat))
R = Sigma_hat / np.outer(std, std)
print(f'\nSample correlation matrix (diagonal = 1 by definition):')
print(np.round(R, 3))
# Cauchy-Schwarz: |Cov(X,Y)|^2 <= Var(X)*Var(Y)
cov12 = Sigma_hat[0, 1]
var1, var2 = Sigma_hat[0,0], Sigma_hat[1,1]
print(f'\nCauchy-Schwarz check:')
print(f' |Cov(X1,X2)|^2 = {cov12**2:.6f}')
print(f' Var(X1)*Var(X2) = {var1*var2:.6f}')
print(f' C-S holds: {cov12**2 <= var1*var2 + 1e-10}')
print('\nPASS — covariance matrix verified')
Code cell 32
# === 3.3 Zero Correlation Does Not Imply Independence ===
import numpy as np
np.random.seed(42)
N = 100000
# Classic counterexample: X ~ N(0,1), Y = X^2
X = np.random.normal(0, 1, N)
Y = X**2
cov_XY = np.cov(X, Y)[0, 1]
corr = np.corrcoef(X, Y)[0, 1]
print(f'X ~ N(0,1), Y = X^2:')
print(f' Cov(X, Y) = {cov_XY:.6f} (theory: E[X^3] - E[X]E[X^2] = 0)')
print(f' Corr(X, Y) = {corr:.6f} (zero!)')
# But they are DEPENDENT: knowing |X| tells you Y exactly
# Test: E[Y | X > 0] vs E[Y | X < 0]
E_Y_pos = Y[X > 0].mean() # E[X^2 | X > 0]
E_Y_neg = Y[X < 0].mean() # E[X^2 | X < 0]
E_Y_all = Y.mean()
print(f'\n E[Y] = {E_Y_all:.4f} (same regardless of sign of X — as expected)')
print(f' E[Y | X>0] = {E_Y_pos:.4f}')
print(f' E[Y | X<0] = {E_Y_neg:.4f}')
# Mutual information would reveal dependence
# X and Y are clearly dependent: Var[Y|X] = 0 (Y = X^2 exactly)
# Test: P(Y < 1) vs P(Y < 1 | X > 1.5)
p_Y_lt1 = (Y < 1).mean()
p_Y_lt1_given_X_gt15 = (Y[X > 1.5] < 1).mean()
print(f'\n P(Y < 1) = {p_Y_lt1:.4f} (theory: P(|X| < 1) ≈ 0.683)')
print(f' P(Y < 1 | X>1.5) = {p_Y_lt1_given_X_gt15:.4f} (impossible: X>1.5 => Y>2.25!)')
print(f'\nConclusion: Cov=0 does NOT imply independence!')
print('PASS — counterexample verified')
11. Score Function and Fisher Information
The score function is $s(\theta; x) = \nabla_\theta \log p(x; \theta)$.
Key identity: $\mathbb{E}_{x \sim p(\cdot;\theta)}[s(\theta; x)] = 0$ — the expected score is zero.
Proof: $\mathbb{E}[\nabla_\theta \log p] = \int \frac{\nabla_\theta p}{p}\, p\, dx = \nabla_\theta \int p\, dx = \nabla_\theta 1 = 0$.
Fisher information: $I(\theta) = \mathbb{E}[s\, s^\top] = \mathrm{Var}(s)$.
Cramér-Rao bound: For any unbiased estimator $\hat\theta$ from $n$ i.i.d. samples, $\mathrm{Var}(\hat\theta) \ge \frac{1}{n\, I(\theta)}$.
AI applications:
- REINFORCE / policy gradient: $\nabla_\theta J = \mathbb{E}_{a \sim \pi_\theta}\big[r(a)\, \nabla_\theta \log \pi_\theta(a)\big]$
- Natural gradient: uses $I(\theta)^{-1}$ to precondition gradient updates
- Diffusion score models: learn $\nabla_x \log p_t(x)$ (the score of the noised distribution)
Code cell 34
# === 11.1 Score Function Identity and Fisher Information ===
import numpy as np
from scipy import stats
np.random.seed(42)
# Verify: E[score] = 0 for Gaussian
# p(x; mu, sigma) = N(mu, sigma^2)
# log p = -0.5*log(2*pi) - log(sigma) - (x-mu)^2 / (2*sigma^2)
# score wrt mu: (x - mu) / sigma^2
mu, sigma = 2.0, 1.5
N = 200000
X = np.random.normal(mu, sigma, N)
score_mu = (X - mu) / sigma**2
score_sigma = (X - mu)**2 / sigma**3 - 1.0 / sigma
print('Score function identity: E[score] = 0')
print(f' E[score_mu] = {score_mu.mean():.6f} (theory: 0)')
print(f' E[score_sigma] = {score_sigma.mean():.6f} (theory: 0)')
# Fisher information for Gaussian
# I(mu) = E[score_mu^2] = 1/sigma^2
# I(sigma) = E[score_sigma^2] = 2/sigma^2
I_mu_sample = (score_mu**2).mean()
I_sigma_sample = (score_sigma**2).mean()
print(f'\nFisher information:')
print(f' I(mu): sample={I_mu_sample:.4f}, theory={1/sigma**2:.4f}')
print(f' I(sigma): sample={I_sigma_sample:.4f}, theory={2/sigma**2:.4f}')
# Cramer-Rao: Var(X_bar) >= 1/n * I(mu)^{-1} = sigma^2/n
n = 100
var_xbar_theory = sigma**2 / n
xbars = [np.random.normal(mu, sigma, n).mean() for _ in range(5000)]
var_xbar_sample = np.var(xbars)
print(f'\nCramer-Rao bound check (n={n}):')
print(f' Var(X_bar) sample = {var_xbar_sample:.6f}')
print(f' CR lower bound = {var_xbar_theory:.6f} (sigma^2/n)')
print(f' Bound satisfied: {var_xbar_sample >= var_xbar_theory - 0.001}')
print(' (X_bar achieves the bound — it is the MLE, hence efficient)')
print('\nPASS — score function and Fisher information verified')
Code cell 35
# === 11.2 Policy Gradient (REINFORCE) via Score Function Trick ===
import numpy as np
np.random.seed(42)
# Simple bandit: action a ~ Categorical(softmax(theta))
# Reward: r(a) = -|a - 2| (best action = 2)
# Objective: J(theta) = E_{a~pi_theta}[r(a)]
# Gradient (REINFORCE): grad_theta J = E[r(a) * grad_theta log pi(a|theta)]
def softmax(x):
e = np.exp(x - x.max())
return e / e.sum()
def policy_gradient_step(theta, n_samples=500, lr=0.05):
probs = softmax(theta)
K = len(theta)
# Sample actions
actions = np.random.choice(K, n_samples, p=probs)
# Compute rewards
rewards = -np.abs(actions - 2).astype(float)
# REINFORCE gradient: sum_a [ r(a) * (1{a=k} - pi_k) ] / n_samples
grad = np.zeros(K)
for k in range(K):
score_k = ((actions == k).astype(float) - probs[k]) # grad log pi
grad[k] = (rewards * score_k).mean()
return theta + lr * grad, rewards.mean()
K = 5 # actions 0..4
theta = np.zeros(K)
rewards_history = []
for step in range(100):
theta, avg_reward = policy_gradient_step(theta)
rewards_history.append(avg_reward)
final_probs = softmax(theta)
print('Policy gradient (REINFORCE) on 5-arm bandit:')
print(f' Optimal action = 2 (reward = 0)')
print(f' Final policy: {final_probs.round(4)}')
print(f' Best action: {final_probs.argmax()} (prob={final_probs.max():.4f})')
print(f' Initial avg reward: {rewards_history[0]:.4f}')
print(f' Final avg reward: {rewards_history[-1]:.4f}')
print(f'\nGradient formula: grad J = E_a[r(a) * grad_theta log pi(a;theta)]')
print('PASS — policy gradient converged to optimal action')
12. Stein's Lemma and the Reparameterisation Trick
Stein's lemma: If $X \sim \mathcal{N}(\mu, \sigma^2)$ and $g$ is differentiable (with $\mathbb{E}|g'(X)| < \infty$), then
$$\mathbb{E}[g(X)(X - \mu)] = \sigma^2\, \mathbb{E}[g'(X)]$$
This is essential for: Gaussian integration by parts, deriving the score function, and proving properties of the normal distribution.
Reparameterisation trick: To compute $\nabla_\mu\, \mathbb{E}_{z \sim \mathcal{N}(\mu,\sigma^2)}[f(z)]$ when the distribution itself depends on the parameter, write $z = \mu + \sigma\epsilon$, $\epsilon \sim \mathcal{N}(0, 1)$. Then:
$$\nabla_\mu\, \mathbb{E}_{z \sim \mathcal{N}(\mu,\sigma^2)}[f(z)] = \mathbb{E}_{\epsilon \sim \mathcal{N}(0,1)}\big[\nabla_\mu f(\mu + \sigma\epsilon)\big]$$
This enables backpropagation through stochastic layers — the foundation of VAEs.
Code cell 37
# === 12.1 Stein's Lemma: Numerical Verification ===
import numpy as np
np.random.seed(42)
N = 500000
mu, sigma = 1.5, 2.0
X = np.random.normal(mu, sigma, N)
# Stein's lemma: E[g(X)(X-mu)] = sigma^2 * E[g'(X)]
# Test with g(x) = x^2 => g'(x) = 2x
# LHS: E[X^2 * (X - mu)]
# RHS: sigma^2 * E[2X] = 2 * sigma^2 * mu
g = X**2
g_prime = 2 * X
LHS = (g * (X - mu)).mean()
RHS = sigma**2 * g_prime.mean()
theory_RHS = sigma**2 * 2 * mu
print("Stein's lemma: E[g(X)(X-mu)] = sigma^2 * E[g'(X)]")
print(f" g(x) = x^2, mu={mu}, sigma={sigma}")
print(f" LHS (sample): {LHS:.4f}")
print(f" RHS (sample): {RHS:.4f}")
print(f" RHS (theory): {theory_RHS:.4f} (= 2*sigma^2*mu)")
print(f" Match: {abs(LHS - RHS) < 0.1}")
# Reparameterisation trick demo
# grad_mu E_{N(mu,sigma^2)}[f(z)] where f(z) = z^2
# = E_epsilon[ grad_mu f(mu + sigma*eps) ] = E_epsilon[2*(mu + sigma*eps)] = 2*mu
eps = np.random.normal(0, 1, N)
z = mu + sigma * eps
grad_via_reparam = (2 * z).mean() # grad_mu of z^2 = 2*z, since dz/dmu=1
grad_theory = 2 * mu
print(f"\nReparameterisation trick: grad_mu E[z^2]")
print(f" Via reparam (sample): {grad_via_reparam:.4f}")
print(f" Theory: 2*mu = {grad_theory:.4f}")
print(f" Match: {abs(grad_via_reparam - grad_theory) < 0.05}")
print("PASS — Stein's lemma and reparameterisation trick verified")
13. Preview: From Moments to Tail Bounds
Expectation and variance give us more than just summary statistics — they directly bound how far a random variable can stray from its mean.
Preview: Markov's and Chebyshev's Inequalities
Markov's inequality: For $X \ge 0$, $P(X \ge t) \le \mathbb{E}[X]/t$. Proof: $t\,\mathbf{1}\{X \ge t\} \le X$; take expectations.
Chebyshev's inequality: $P(|X - \mu| \ge k\sigma) \le 1/k^2$. Proof: Apply Markov to $(X-\mu)^2$ with threshold $k^2\sigma^2$.
These are the simplest concentration results. Tighter bounds (Hoeffding, Chernoff) that exploit bounded or sub-Gaussian distributions are developed in full in:
→ Full treatment: §05 Concentration Inequalities
Code cell 39
# === 13.1 Markov and Chebyshev: Quick Verification ===
import numpy as np
from scipy import stats
np.random.seed(42)
N = 500000
# Exponential(1): mean=1, var=1
lam = 1.0
X = np.random.exponential(1.0/lam, N)
mu = X.mean()
sigma = X.std()
print('Markov inequality: P(X >= t) <= E[X]/t')
for t in [2, 3, 5]:
empirical = (X >= t).mean()
markov_bound = mu / t
theory = np.exp(-lam * t) # exact for Exp(lambda)
print(f' t={t}: P(X>={t})={empirical:.4f}, Markov={markov_bound:.4f}, '
f'Exact={theory:.4f} bound_holds={empirical <= markov_bound + 1e-4}')
print('\nChebyshev inequality: P(|X - mu| >= k*sigma) <= 1/k^2')
for k in [2, 3, 4]:
empirical = (np.abs(X - mu) >= k * sigma).mean()
cheby_bound = 1.0 / k**2
print(f' k={k}: empirical={empirical:.4f}, Chebyshev bound={cheby_bound:.4f}, '
f'bound_holds={empirical <= cheby_bound + 1e-4}')
print('\nNote: bounds hold for ANY distribution — very conservative for Exponential')
print('For tighter bounds: Hoeffding, Chernoff -> see §05 Concentration Inequalities')
print('PASS — Markov and Chebyshev verified')
14. Preview: Law of Large Numbers and Central Limit Theorem
The expectation operator has a profound empirical meaning: as we collect more samples, the sample mean converges to the true expectation.
Preview: Law of Large Numbers
If $X_1, X_2, \ldots$ are i.i.d. with $\mathbb{E}|X_1| < \infty$ and mean $\mu$, then $\bar{X}_n = \frac{1}{n}\sum_{i=1}^{n} X_i \xrightarrow{P} \mu$.
Proof sketch: By Chebyshev, $P(|\bar{X}_n - \mu| \ge \epsilon) \le \frac{\sigma^2}{n\epsilon^2} \to 0$ (this bound is checked numerically in the sketch below).
Preview: Central Limit Theorem
Under the same conditions (finite variance $\sigma^2$), $\sqrt{n}\,\frac{\bar{X}_n - \mu}{\sigma} \xrightarrow{d} \mathcal{N}(0, 1)$.
→ Full treatment: §06 Stochastic Processes
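The Chebyshev step in the proof sketch can be checked directly before the full demonstration below. A minimal sketch for Exp(1) samples ($\mu = \sigma^2 = 1$); the choice $\epsilon = 0.2$ is arbitrary:
# === LLN proof-sketch check: P(|Xbar_n - mu| >= eps) <= sigma^2/(n*eps^2) ===
import numpy as np
np.random.seed(42)
mu, sigma2, eps, reps = 1.0, 1.0, 0.2, 10000
for n in [10, 50, 200, 1000]:
    xbars = np.random.exponential(1.0, (reps, n)).mean(axis=1)
    p_emp = (np.abs(xbars - mu) >= eps).mean()
    bound = sigma2 / (n * eps**2)
    print(f'  n={n:5d}: empirical={p_emp:.4f}, Chebyshev bound={min(bound, 1.0):.4f}, '
          f'holds={p_emp <= bound + 1e-9}')
# Both sides -> 0 as n grows: exactly the weak law of large numbers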
Code cell 41
# === 14.1 LLN and CLT: Empirical Demonstration ===
import numpy as np
from scipy import stats
np.random.seed(42)
# LLN: sample mean of Exp(1) converges to E[X] = 1
ns = [1, 5, 10, 50, 100, 500, 1000, 5000]
X = np.random.exponential(1.0, 10000)
print('Law of Large Numbers: sample mean of Exp(1) -> E[X] = 1.0')
for n in ns:
xbar = X[:n].mean()
print(f' n={n:5d}: X_bar = {xbar:.4f}')
# CLT: standardised mean of Bernoulli(p) -> N(0,1)
print('\nCentral Limit Theorem: sqrt(n)*(X_bar - mu)/sigma -> N(0,1)')
p = 0.3 # Bernoulli(0.3)
mu_bern = p
sigma_bern = np.sqrt(p * (1-p))
n_clt = 50
n_reps = 10000
# Compute standardised sample means
samples = np.random.binomial(1, p, (n_reps, n_clt))
xbars = samples.mean(axis=1)
Z = np.sqrt(n_clt) * (xbars - mu_bern) / sigma_bern
# Test normality
ks_stat, ks_pval = stats.kstest(Z, 'norm')
print(f' Distribution: Bernoulli({p}), n={n_clt}, reps={n_reps}')
print(f' Standardised mean: mean={Z.mean():.4f}, std={Z.std():.4f}')
print(f' KS test vs N(0,1): stat={ks_stat:.4f}, p-value={ks_pval:.4f}')
print(f' Normal? {ks_pval > 0.01} (KS p>0.01 fails to reject normality)')
print('\n=> For full theory (weak LLN, strong LLN, CLT proof): §06 Stochastic Processes')
print('PASS — LLN and CLT preview demonstrated')
Summary: Key Results in Expectation and Moments
| Result | Formula | Application |
|---|---|---|
| LOTUS | $\mathbb{E}[g(X)] = \int g(x)\,f_X(x)\,dx$ | Computing moments, MGF |
| Linearity | $\mathbb{E}[aX+bY] = a\,\mathbb{E}[X] + b\,\mathbb{E}[Y]$ | Binomial mean, indicator trick |
| Variance | $\mathrm{Var}(X) = \mathbb{E}[X^2] - (\mathbb{E}[X])^2$ | Risk, regularisation |
| Tower property | $\mathbb{E}[\mathbb{E}[Y \mid X]] = \mathbb{E}[Y]$ | Mixture models, hierarchical sampling |
| Law of total variance | $\mathrm{Var}(Y) = \mathbb{E}[\mathrm{Var}(Y \mid X)] + \mathrm{Var}(\mathbb{E}[Y \mid X])$ | Within/between decomposition |
| Jensen's inequality | $f(\mathbb{E}[X]) \le \mathbb{E}[f(X)]$ for convex $f$ | KL ≥ 0, ELBO |
| Cauchy-Schwarz | $\lvert\mathrm{Cov}(X,Y)\rvert^2 \le \mathrm{Var}(X)\,\mathrm{Var}(Y)$ | $\lvert\rho\rvert \le 1$ |
| Score identity | $\mathbb{E}[\nabla_\theta \log p(X;\theta)] = 0$ | Policy gradient, NaturalGrad |
| Stein's lemma | $\mathbb{E}[g(X)(X-\mu)] = \sigma^2\,\mathbb{E}[g'(X)]$ | Score matching, Gaussian int-by-parts |
| Markov bound | $P(X \ge t) \le \mathbb{E}[X]/t$ | → §05 for sharper bounds |
| Reparameterisation | $z = \mu + \sigma\epsilon$, $\epsilon \sim \mathcal{N}(0,1)$ | VAE, diffusion backprop |
| Bias-variance | $\mathrm{MSE} = \mathrm{Bias}^2 + \mathrm{Var} + \sigma^2$ | Model selection, double descent |
3. (cont.) Visualising Covariance: Ellipses and Principal Axes
The covariance matrix $\Sigma$ determines the shape of a multivariate Gaussian's level sets. Its eigenvectors give the principal axes; its eigenvalues give the variance (spread) along each axis.
This directly underpins PCA (find axes of maximum variance) and whitening (transform so that $\mathrm{Cov} = I$; a sketch follows the code below).
Code cell 44
# === 3.4 Covariance Ellipses and PCA Intuition ===
import numpy as np
np.random.seed(42)
N = 500
# Manually build covariance matrices with different correlations
configs = [
('No correlation (rho=0)', np.array([[1.0, 0.0], [0.0, 1.0]])),
('Pos. correlation (rho=.8)', np.array([[1.0, 0.8], [0.8, 1.0]])),
('Neg. correlation (rho=-.6)',np.array([[1.0,-0.6], [-0.6,1.0]])),
('Unequal variance', np.array([[3.0, 0.5], [0.5, 0.5]])),
]
for name, Sigma in configs:
eigvals, eigvecs = np.linalg.eigh(Sigma)
X = np.random.multivariate_normal([0, 0], Sigma, N)
sample_cov = np.cov(X.T)
print(f'{name}')
print(f' Eigenvalues: {eigvals.round(3)}')
print(f' Cov match: {np.allclose(sample_cov, Sigma, atol=0.15)}')
# PCA demo: project onto principal axes
Sigma = np.array([[2.0, 1.5], [1.5, 2.0]])
X = np.random.multivariate_normal([0, 0], Sigma, 1000)
eigvals, eigvecs = np.linalg.eigh(Sigma)
# Sort descending
idx = np.argsort(-eigvals)
eigvals, eigvecs = eigvals[idx], eigvecs[:, idx]
Z = X @ eigvecs # project onto principal axes (whitened directions)
print(f'\nPCA projection variance:')
print(f' PC1 variance: sample={Z[:,0].var():.3f}, theory={eigvals[0]:.3f}')
print(f' PC2 variance: sample={Z[:,1].var():.3f}, theory={eigvals[1]:.3f}')
print(f' PC1 direction: {eigvecs[:,0].round(3)} (45-degree line)')
print(f'\nPCA extracts directions of maximum variance via covariance eigendecomposition')
print('PASS — covariance ellipses and PCA verified')
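To complete the whitening claim from the intro: after rotating into the eigenbasis, dividing by the square-root eigenvalues equalises the variances. A minimal sketch using the ZCA whitening matrix $\Sigma^{-1/2}$ (same $\Sigma$ as the PCA demo above):
# === Whitening: transform so Cov ≈ I (sketch) ===
import numpy as np
np.random.seed(42)
Sigma = np.array([[2.0, 1.5], [1.5, 2.0]])
X = np.random.multivariate_normal([0, 0], Sigma, 100000)
eigvals, eigvecs = np.linalg.eigh(Sigma)
W = eigvecs @ np.diag(1.0 / np.sqrt(eigvals)) @ eigvecs.T  # Sigma^{-1/2} (ZCA whitening)
X_white = X @ W
print('Covariance after whitening (should be ~ identity):')
print(np.round(np.cov(X_white.T), 3))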
5. (cont.) MGF of Binomial and Poisson
The MGF uniquely characterises a distribution (when it exists) and provides a clean proof that the sum of independent Poisson random variables is Poisson.
Binomial: $M(t) = (1 - p + p e^t)^n$, giving $\mathbb{E}[X] = np$, $\mathrm{Var}(X) = np(1-p)$.
Poisson: $M(t) = \exp(\lambda(e^t - 1))$, giving $\mathbb{E}[X] = \mathrm{Var}(X) = \lambda$.
Reproductive property: If $X \sim \mathrm{Pois}(\lambda_1)$ and $Y \sim \mathrm{Pois}(\lambda_2)$ independently, then $M_{X+Y}(t) = \exp((\lambda_1 + \lambda_2)(e^t - 1))$, so $X + Y \sim \mathrm{Pois}(\lambda_1 + \lambda_2)$.
Code cell 46
# === 5.2 MGF: Binomial and Poisson ===
import numpy as np
from scipy import stats
np.random.seed(42)
N = 200000
# Binomial MGF: M(t) = (1-p+p*e^t)^n
n_binom, p_binom = 15, 0.4
ts = np.array([0.1, 0.3, 0.5])
print('Binomial MGF: M(t) = (1-p+p*exp(t))^n')
X_binom = np.random.binomial(n_binom, p_binom, N)
for t in ts:
mgf_sample = np.exp(t * X_binom).mean()
mgf_theory = (1 - p_binom + p_binom * np.exp(t))**n_binom
print(f' t={t}: sample={mgf_sample:.4f}, theory={mgf_theory:.4f}, '
f'match={abs(mgf_sample-mgf_theory)<0.01}')
# Moments from MGF differentiation:
# M'(0) = E[X] = n*p
# M''(0) = E[X^2], Var = M''(0) - (M'(0))^2
mu_theory = n_binom * p_binom
var_theory = n_binom * p_binom * (1 - p_binom)
print(f' Mean: theory={mu_theory:.2f}, sample={X_binom.mean():.4f}')
print(f' Var: theory={var_theory:.2f}, sample={X_binom.var():.4f}')
# Poisson MGF and reproductive property
lam1, lam2 = 3.0, 5.0
X1 = np.random.poisson(lam1, N)
X2 = np.random.poisson(lam2, N)
S = X1 + X2
print(f'\nPoisson reproductive property: Pois({lam1})+Pois({lam2}) ~ Pois({lam1+lam2})')
print(f' S mean: {S.mean():.4f} (theory: {lam1+lam2})')
print(f' S var: {S.var():.4f} (theory: {lam1+lam2})')
# KS test: S vs Poisson(lam1+lam2)
expected_pmf = stats.poisson.pmf(np.arange(30), lam1+lam2)
sample_counts, _ = np.histogram(S, bins=np.arange(31))
sample_pmf = sample_counts / N
print(f' Total var dist (S vs Pois({lam1+lam2})): {np.abs(sample_pmf - expected_pmf).sum()/2:.4f}')
print('PASS — Binomial and Poisson MGFs verified')
8. (cont.) Neural Network Initialisation via Moment Analysis
Weight initialisation is a moment-matching problem. The goal is to preserve signal variance through the network: $\mathrm{Var}(a^{(l)}) \approx \mathrm{Var}(a^{(l-1)})$.
For a layer $a = Wx$ with $W_{ij}$ i.i.d. (mean 0, variance $\sigma_w^2$) and $x_j$ i.i.d.:
$$\mathrm{Var}(a_i) = n\,\sigma_w^2\,\mathrm{Var}(x_j)$$
For variance preservation, we need $n\,\sigma_w^2 = 1$ (checked in the sketch after this list).
- Xavier init: $\sigma_w^2 = 1/n$ — optimal for linear/tanh
- He init: $\sigma_w^2 = 2/n$ — accounts for ReLU halving the variance (only half the neurons are active)
- Adam $\beta_1, \beta_2$: relate to the timescale over which the first/second moment estimates track the gradient distribution
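Before the deep-network simulation, the one-layer identity above can be confirmed in isolation. A minimal sketch (pre-activation, i.e. no nonlinearity; the specific n and sigma_w are arbitrary choices):
# === One-layer check: Var((Wx)_i) = n * sigma_w^2 * Var(x_j) ===
import numpy as np
np.random.seed(42)
n = 1024          # fan-in
sigma_w = 0.05    # weight std, so n*sigma_w^2 = 2.56
x = np.random.randn(5000, n)             # inputs with Var(x_j) = 1
W = np.random.randn(n, n) * sigma_w      # W_ij iid, mean 0, variance sigma_w^2
a = x @ W                                # pre-activations
print(f'Var(a_i) sample: {a.var():.4f}')
print(f'Theory n*sigma_w^2*Var(x): {n * sigma_w**2:.4f}')
# Xavier chooses sigma_w^2 = 1/n so this equals Var(x); He uses 2/n to offset ReLU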
Code cell 48
# === 8.2 Weight Initialisation: Moment-Based Analysis ===
import numpy as np
np.random.seed(42)
def relu(x):
return np.maximum(0, x)
# Simulate signal propagation through a deep network
n_layers = 20
n_neurons = 512
n_samples = 100
results = {}
for init_name, scale_fn in [
('Tiny (0.01)', lambda n: 0.01),
('Too large (1.0)',lambda n: 1.0),
('Xavier (1/n)', lambda n: np.sqrt(1.0/n)),
('He (2/n)', lambda n: np.sqrt(2.0/n)),
]:
x = np.random.randn(n_samples, n_neurons)
vars_per_layer = [x.var()]
for _ in range(n_layers):
W = np.random.randn(n_neurons, n_neurons) * scale_fn(n_neurons)
x = relu(x @ W)
vars_per_layer.append(x.var())
results[init_name] = vars_per_layer
print(f'{init_name:20s}: layer0={vars_per_layer[0]:.3f}, '
f'layer10={vars_per_layer[10]:.6f}, layer20={vars_per_layer[20]:.6f}')
print('\nConclusion:')
print(' Tiny init -> vanishing activations (all zeros after few layers)')
print(' Large init -> exploding activations (overflow after few layers)')
print(' Xavier: for linear, tanh. He: for ReLU (accounts for E[ReLU output^2] = Var/2)')
print('\nHe init derivation:')
print(' Var(ReLU(z)) = Var(z)/2 (only positive half contributes)')
print(' Need: n * sigma_w^2 * Var(z)/2 = Var(z)')
print(' => sigma_w^2 = 2/n (He initialisation)')
print('PASS — initialisation moment analysis demonstrated')
8. (cont.) Batch Normalisation as Moment Normalisation
BatchNorm explicitly normalises activations to have zero mean and unit variance within each mini-batch:
$$\hat{x}_i = \frac{x_i - \mu_B}{\sqrt{\sigma_B^2 + \epsilon}}, \qquad \text{where } \mu_B = \frac{1}{m}\sum_{i=1}^{m} x_i \text{ and } \sigma_B^2 = \frac{1}{m}\sum_{i=1}^{m}(x_i - \mu_B)^2.$$
By the CLT (§06), as batch size $m \to \infty$, $\mu_B \to \mathbb{E}[x]$ and $\sigma_B^2 \to \mathrm{Var}(x)$. The stochasticity at finite batch size acts as a regulariser.
Learnable parameters $\gamma, \beta$ then rescale and shift: $y_i = \gamma\,\hat{x}_i + \beta$.
Code cell 50
# === 8.3 Batch Normalisation: Moment Computation ===
import numpy as np
np.random.seed(42)
def batch_norm(x, gamma=1.0, beta=0.0, eps=1e-5):
"""BatchNorm along first axis (batch dimension)."""
mu = x.mean(axis=0)
sigma2 = x.var(axis=0)
x_hat = (x - mu) / np.sqrt(sigma2 + eps)
return gamma * x_hat + beta, mu, sigma2
# Simulate a layer with non-zero mean and large variance
batch_size = 32
n_features = 64
# Activations before BN: mean ~3, std ~2
X_raw = np.random.normal(3.0, 2.0, (batch_size, n_features))
print(f'Before BatchNorm:')
print(f' Mean: {X_raw.mean():.3f} (over all elements)')
print(f' Std: {X_raw.std():.3f}')
X_bn, mu_batch, var_batch = batch_norm(X_raw)
print(f'\nAfter BatchNorm:')
print(f' Mean: {X_bn.mean():.6f} (should be ~0)')
print(f' Std: {X_bn.std():.6f} (should be ~1)')
# Effect of batch size on estimation quality
print(f'\nBatch size effect on moment estimation quality (true mean=3, std=2):')
for bs in [4, 8, 16, 32, 64, 256]:
X = np.random.normal(3.0, 2.0, (1000, bs))
mu_hat = X.mean(axis=1) # 1000 independent estimates
std_est = mu_hat.std()
print(f' bs={bs:4d}: std of mu_hat = {std_est:.4f} (CLT: sigma/sqrt(n) = {2.0/np.sqrt(bs):.4f})')
print('\nSmall batches: noisy moments -> implicit regularisation (noise as feature)')
print('Large batches: accurate moments -> stable but may overfit')
print('PASS — batch normalisation moments verified')
Code cell 51
# === 8.4 Layer Norm vs RMS Norm: Moment Variants ===
import numpy as np
np.random.seed(42)
def layer_norm(x, gamma=None, beta=None, eps=1e-5):
"""Normalise over feature dimension (each sample independently)."""
mu = x.mean(axis=-1, keepdims=True)
sigma2 = x.var(axis=-1, keepdims=True)
x_hat = (x - mu) / np.sqrt(sigma2 + eps)
if gamma is not None:
x_hat = gamma * x_hat + beta
return x_hat
def rms_norm(x, gamma=None, eps=1e-5):
"""RMS Norm: no mean subtraction (used in LLaMA, Mistral)."""
rms = np.sqrt((x**2).mean(axis=-1, keepdims=True) + eps)
x_hat = x / rms
if gamma is not None:
x_hat = gamma * x_hat
return x_hat
# Single token embedding (typical in transformers)
d_model = 512
batch = 8
X = np.random.normal(0, 3.0, (batch, d_model)) # vary std
X_ln = layer_norm(X)
X_rms = rms_norm(X)
print('Layer Norm (normalises mean AND variance):')
print(f' Mean per sample (should be 0): {X_ln.mean(axis=-1).round(4)}')
print(f' Std per sample (should be 1): {X_ln.std(axis=-1).round(4)}')
print('\nRMS Norm (normalises RMS = sqrt(E[X^2]), no mean subtraction):')
print(f' RMS per sample (should be 1): {np.sqrt((X_rms**2).mean(axis=-1)).round(4)}')
print(f' Mean per sample (NOT zeroed): {X_rms.mean(axis=-1).round(4)}')
print('\nKey insight:')
print(' LayerNorm: x_hat = (x - E[x]) / sqrt(Var(x)+eps) — uses 1st AND 2nd moment')
print(' RMSNorm: x_hat = x / sqrt(E[x^2]+eps) — uses 2nd moment only')
print(' RMSNorm is faster (no mean computation) and used in LLaMA/Mistral')
print('PASS — LayerNorm and RMSNorm moment analysis')
5. (cont.) MGF and Sums of Independent Variables
The most powerful use of MGFs: if $X$ and $Y$ are independent, then
$$M_{X+Y}(t) = \mathbb{E}\big[e^{t(X+Y)}\big] = M_X(t)\,M_Y(t)$$
This immediately gives:
- Sum of $n$ i.i.d. $\mathrm{Exp}(\lambda) \sim \mathrm{Gamma}(n, \lambda)$
- Sum of $n$ i.i.d. $\mathcal{N}(\mu, \sigma^2) \sim \mathcal{N}(n\mu, n\sigma^2)$
- Sum of independent Poissons is Poisson
The log-MGF $K(t) = \log M(t)$ is the cumulant generating function — its derivatives at $t = 0$ give cumulants (mean, variance, skewness, kurtosis).
Code cell 53
# === 5.3 MGF of Sums and Cumulant Generating Functions ===
import numpy as np
from scipy import stats
np.random.seed(42)
N = 200000
# Cumulants of Normal: K(t) = mu*t + sigma^2*t^2/2
# K'(0) = mu (first cumulant = mean)
# K''(0) = sigma^2 (second cumulant = variance)
# K'''(0) = 0 (third cumulant = 0 for Gaussian)
mu, sigma = 2.0, 1.5
X = np.random.normal(mu, sigma, N)
# Numerical derivative of log MGF
dt = 1e-4
log_mgf = lambda t: np.log(np.exp(t * X).mean())
kappa1 = (log_mgf(dt) - log_mgf(-dt)) / (2*dt)
kappa2 = (log_mgf(dt) - 2*log_mgf(0) + log_mgf(-dt)) / dt**2
print('Cumulant generating function K(t) = log M(t):')
print(f' K\'(0) = {kappa1:.4f} (theory: mu = {mu})')
print(f' K"(0) = {kappa2:.4f} (theory: sigma^2 = {sigma**2:.4f})')
# MGF product property: sum of n Exp(lambda) ~ Gamma(n, lambda)
lam = 2.0
n_exp = 4
X_exp = [np.random.exponential(1.0/lam, N) for _ in range(n_exp)]
S = sum(X_exp) # Should be Gamma(n, lambda)
# Gamma(n, lam) has mean=n/lam, var=n/lam^2
print(f'\nSum of {n_exp} Exp({lam}) ~ Gamma({n_exp}, {lam}):')
print(f' Mean: sample={S.mean():.4f}, theory={n_exp/lam:.4f}')
print(f' Var: sample={S.var():.4f}, theory={n_exp/lam**2:.4f}')
# KS test against Gamma
ks_stat, ks_pval = stats.kstest(S, lambda x: stats.gamma.cdf(x, a=n_exp, scale=1.0/lam))
print(f' KS test vs Gamma({n_exp}, scale={1/lam:.2f}): p={ks_pval:.4f} '
f'(fails to reject: {ks_pval > 0.01})')
print('PASS — MGF product property and cumulants verified')
7. (cont.) Double Descent and the Interpolation Threshold
Classical bias-variance predicts a single optimal model complexity. Modern overparameterised models (transformers, large networks) exhibit double descent:
- Classical regime ($p < n$): bias ↓ and variance ↑ as complexity increases
- Interpolation threshold ($p \approx n$): model just barely fits the training data — the worst point
- Overparameterised regime ($p \gg n$): both bias and variance can decrease!
Why? Among all models that perfectly interpolate the training data, minimum-norm solutions (the implicit regularisation of gradient descent) automatically favour smooth, low-variance predictors — made concrete in the sketch below.
This reconciles the empirical success of over-parameterisation with statistical theory.
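The minimum-norm point can be made concrete before the full simulation: in the underdetermined regime the pseudoinverse returns the interpolant with smallest $\|w\|$, and adding any null-space component keeps the training fit perfect while making the predictor wilder. A minimal sketch (the tiny 5-point setup is illustrative, separate from the simulation below):
# === Minimum-norm interpolation vs an arbitrary interpolant (sketch) ===
import numpy as np
np.random.seed(42)
n, d = 5, 10                                 # 5 points, 11 features: underdetermined
x = np.linspace(0, 1, n)
y = np.sin(2 * np.pi * x)
Phi = np.column_stack([x**k for k in range(d + 1)])
w_min = np.linalg.pinv(Phi) @ y              # minimum-norm interpolating solution
_, _, Vt = np.linalg.svd(Phi)
w_alt = w_min + 5.0 * Vt[-1]                 # add a null-space direction: still interpolates
x_te = np.linspace(0, 1, 200)
Phi_te = np.column_stack([x_te**k for k in range(d + 1)])
for name, w in [('min-norm', w_min), ('alt interpolant', w_alt)]:
    print(f'  {name:15s}: ||w||={np.linalg.norm(w):8.3f}, '
          f'max train err={np.abs(Phi @ w - y).max():.2e}, '
          f'test range=[{(Phi_te @ w).min():.2f}, {(Phi_te @ w).max():.2f}]')
# Gradient descent initialised at w=0 converges to the min-norm solution — the implicit bias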
Code cell 55
# === 7.2 Double Descent: Simulation ===
import numpy as np
np.random.seed(42)
# Generate data from y = sin(2*pi*x) + noise
def true_fn(x):
return np.sin(2 * np.pi * x)
n_train = 20
n_test = 500
sigma_noise = 0.3
X_train = np.linspace(0, 1, n_train)
y_train = true_fn(X_train) + np.random.normal(0, sigma_noise, n_train)
X_test = np.linspace(0, 1, n_test)
y_test = true_fn(X_test)
# Fit polynomial of degree d using least-squares / minimum-norm
def poly_features(X, d):
return np.column_stack([X**k for k in range(d+1)])
def fit_predict(d, X_tr, y_tr, X_te):
Phi_tr = poly_features(X_tr, d)
Phi_te = poly_features(X_te, d)
# lstsq: least-squares fit when overdetermined, minimum-norm solution when underdetermined
w, _, _, _ = np.linalg.lstsq(Phi_tr, y_tr, rcond=None)
return Phi_te @ w
degrees = [1, 3, 5, 10, 15, 19, 20, 25, 30, 40]
test_mses = []
for d in degrees:
try:
y_pred = fit_predict(d, X_train, y_train, X_test)
mse = np.mean((y_pred - y_test)**2)
except Exception:
mse = float('nan')
test_mses.append(mse)
region = 'classical' if d < n_train else 'overparameterised'
print(f' degree={d:3d} (params={d+1:3d}, {region:20s}): test MSE={mse:.4f}')
peak_deg = degrees[np.nanargmax(test_mses)]
best_deg = degrees[np.nanargmin(test_mses)]
print(f'\nInterpolation threshold (~n={n_train}): worst at degree={peak_deg}')
print(f'Best model: degree={best_deg} (test MSE={min(mse for mse in test_mses if not np.isnan(mse)):.4f})')
print('Double descent: overparameterised models can be better than underfitting ones')
print('PASS — double descent demonstrated')