Theory Notebook
Converted from theory.ipynb for web reading.
§01 Introduction and Random Variables
"Probability is not about chance — it is about the precise language for reasoning under uncertainty."
Interactive companion notebook. Work through the cells in order; each section mirrors the corresponding
section of notes.md. Run a cell with Shift+Enter.
Sections:
- Sample Spaces and Probability Axioms
- Conditional Probability and Bayes' Theorem
- Independence
- Random Variables, CDF, PDF, PMF
- Discrete Distributions (Bernoulli, Geometric)
- Continuous Distributions (Uniform, Gaussian)
- Expectation and Variance
- Transformations and the Change-of-Variables Formula
- ML Applications: Cross-Entropy and NLL
- Monte Carlo Simulation
- Information Theory: Entropy and KL Divergence
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import numpy as np
from scipy import stats
import math
try:
    import matplotlib.pyplot as plt
    import matplotlib
    plt.style.use('seaborn-v0_8-whitegrid')
    plt.rcParams['figure.figsize'] = [10, 5]
    plt.rcParams['font.size'] = 12
    HAS_MPL = True
except ImportError:
    HAS_MPL = False

try:
    import seaborn as sns
    HAS_SNS = True
except ImportError:
    HAS_SNS = False
np.set_printoptions(precision=6, suppress=True)
np.random.seed(42)
print('Setup complete.')
1. Sample Spaces and Probability Axioms
We model every random experiment by specifying:
- Sample space $\Omega$: the set of all possible outcomes
- Events $A \subseteq \Omega$: subsets we care about
- Probability $P$: a function on events satisfying Kolmogorov's three axioms
Axiom 1: $P(A) \ge 0$ for all events $A$
Axiom 2: $P(\Omega) = 1$ (normalisation)
Axiom 3: For disjoint events $A_1, A_2, \dots$: $P\!\left(\bigcup_i A_i\right) = \sum_i P(A_i)$
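Before the worked die example in the next cell, here is a minimal self-contained sketch of Axiom 3 on the same fair-die model (the events `low` and `high` are illustrative choices): two disjoint events satisfy $P(A \cup B) = P(A) + P(B)$.

```python
# Axiom 3 (additivity) for two disjoint events on a fair six-sided die
Omega = {1, 2, 3, 4, 5, 6}
low, high = {1, 2}, {5, 6}                       # disjoint events
assert low & high == set()
p_union = len(low | high) / len(Omega)           # P(low ∪ high)
p_sum = len(low) / len(Omega) + len(high) / len(Omega)
assert abs(p_union - p_sum) < 1e-12              # additivity holds
print(p_union, p_sum)                            # both 0.666...
```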
Code cell 5
# === 1.1 Finite Sample Spaces ===
# Model a fair 6-sided die
Omega = {1, 2, 3, 4, 5, 6}
# Uniform probability assignment
def P_uniform(event, sample_space):
    """P(A) = |A| / |Omega| for equally likely outcomes."""
    return len(event) / len(sample_space)
# Events
A = {2, 4, 6} # even number
B = {1, 2, 3} # at most 3
A_and_B = A & B # intersection
A_or_B = A | B # union
print(f'P(even) = {P_uniform(A, Omega):.4f}')
print(f'P(at most 3) = {P_uniform(B, Omega):.4f}')
print(f'P(A ∩ B) = {P_uniform(A_and_B, Omega):.4f}')
print(f'P(A ∪ B) = {P_uniform(A_or_B, Omega):.4f}')
# Verify inclusion-exclusion: P(A∪B) = P(A) + P(B) - P(A∩B)
ie = P_uniform(A, Omega) + P_uniform(B, Omega) - P_uniform(A_and_B, Omega)
print(f'\nInclusion-exclusion check: {ie:.4f}')
assert abs(P_uniform(A_or_B, Omega) - ie) < 1e-12, 'Inclusion-exclusion failed'
print('PASS - inclusion-exclusion verified')
Code cell 6
# === 1.2 Union Bound (Boole's Inequality) ===
# Demonstrate that P(A∪B) ≤ P(A) + P(B)
import itertools
def union_bound_demo(p_a, p_b, n_outcomes=1000):
    """Show union bound holds for random events with given marginal probs."""
    # Create events with approximately the right marginal probabilities
    Omega_large = np.arange(n_outcomes)
    A_mask = np.random.rand(n_outcomes) < p_a
    B_mask = np.random.rand(n_outcomes) < p_b
    p_A = A_mask.mean()
    p_B = B_mask.mean()
    p_AuB = (A_mask | B_mask).mean()
    p_AnB = (A_mask & B_mask).mean()
    print(f' P(A) = {p_A:.4f} (target {p_a})')
    print(f' P(B) = {p_B:.4f} (target {p_b})')
    print(f' P(A∪B) = {p_AuB:.4f}')
    print(f' P(A)+P(B)= {p_A+p_B:.4f} (union bound)')
    print(f' P(A∩B) = {p_AnB:.4f}')
    print(f' Union bound holds: {p_AuB <= p_A + p_B + 1e-10}')
    return p_AuB, p_A + p_B
np.random.seed(42)
print('Union bound demonstration:')
actual, bound = union_bound_demo(0.3, 0.4)
print(f'\nSlack = {bound - actual:.4f} (the probability of the intersection)')
2. Conditional Probability and Bayes' Theorem
The conditional probability of $A$ given $B$ (with $P(B) > 0$): $P(A \mid B) = \frac{P(A \cap B)}{P(B)}$
Chain rule: $P(A \cap B) = P(A \mid B)\,P(B)$
Bayes' theorem: $P(A \mid B) = \frac{P(B \mid A)\,P(A)}{P(B)}$
Law of total probability: If $A_1, \dots, A_n$ partitions $\Omega$: $P(B) = \sum_{i=1}^{n} P(B \mid A_i)\,P(A_i)$
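Worked instance (using the medical-test numbers from the next cell: prevalence 0.01, sensitivity 0.99, false-positive rate 0.05):

$$P(\text{positive}) = 0.99 \cdot 0.01 + 0.05 \cdot 0.99 = 0.0594,$$

$$P(\text{sick} \mid \text{positive}) = \frac{0.99 \cdot 0.01}{0.0594} \approx 0.167.$$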
Code cell 8
# === 2.1 Medical Test — Bayes' Theorem ===
# Disease prevalence: 1%
# Test sensitivity P(positive | sick) = 0.99
# Test specificity P(negative | healthy) = 0.95 => P(positive | healthy) = 0.05
p_disease = 0.01 # prior
p_pos_given_sick = 0.99 # sensitivity
p_pos_given_healthy = 0.05 # 1 - specificity
# Law of total probability
p_healthy = 1 - p_disease
p_positive = (p_pos_given_sick * p_disease +
              p_pos_given_healthy * p_healthy)
# Bayes' theorem
p_sick_given_pos = p_pos_given_sick * p_disease / p_positive
print(f'P(disease) = {p_disease:.4f}')
print(f'P(positive) = {p_positive:.4f}')
print(f'P(sick | positive) = {p_sick_given_pos:.4f} ({100*p_sick_given_pos:.1f}%)')
print(f'P(healthy | positive) = {1-p_sick_given_pos:.4f} ({100*(1-p_sick_given_pos):.1f}%)')
print()
print('Interpretation: Even with a positive test, only ~16.7% chance of disease')
print('=> Low base rate makes the test result much weaker than its accuracy implies')
Code cell 9
# === 2.2 Sequential Bayes Updating ===
# Start with prior, update as positive tests accumulate
def bayes_update(prior, likelihood_pos, likelihood_neg):
    """Update belief P(H) given a positive observation."""
    p_pos = likelihood_pos * prior + likelihood_neg * (1 - prior)
    posterior = likelihood_pos * prior / p_pos
    return posterior

prior = 0.01
posteriors = [prior]
for i in range(10):
    prior = bayes_update(prior, 0.99, 0.05)
    posteriors.append(prior)
    print(f'After {i+1} positive test(s): P(sick) = {prior:.4f}')

if HAS_MPL:
    fig, ax = plt.subplots()
    ax.plot(range(len(posteriors)), posteriors, 'o-', color='steelblue', lw=2)
    ax.axhline(0.5, color='red', ls='--', alpha=0.5, label='50% threshold')
    ax.set_xlabel('Number of positive tests')
    ax.set_ylabel('P(disease | data)')
    ax.set_title('Sequential Bayesian Updating')
    ax.legend()
    plt.tight_layout()
    plt.show()
3. Independence
Events $A$ and $B$ are independent if: $P(A \cap B) = P(A)\,P(B)$.
Equivalent (when $P(B) > 0$): $P(A \mid B) = P(A)$, i.e. knowing $B$ tells you nothing about $A$.
Mutual independence of $A_1, \dots, A_n$: every finite subset satisfies the product rule. Pairwise independence does NOT imply mutual independence.
Code cell 11
# === 3.1 Pairwise Independent but Not Mutually Independent ===
# Classic example: flip two fair coins X, Y; define Z = X XOR Y
# X, Y, Z are pairwise independent but NOT mutually independent
outcomes = [(x, y) for x in [0, 1] for y in [0, 1]]
prob = 1/4 # uniform
def joint_prob(condition):
    return sum(prob for (x, y) in outcomes if condition(x, y, x ^ y))
p_X1 = joint_prob(lambda x,y,z: x==1)
p_Y1 = joint_prob(lambda x,y,z: y==1)
p_Z1 = joint_prob(lambda x,y,z: z==1)
p_X1_Y1 = joint_prob(lambda x,y,z: x==1 and y==1)
p_X1_Z1 = joint_prob(lambda x,y,z: x==1 and z==1)
p_Y1_Z1 = joint_prob(lambda x,y,z: y==1 and z==1)
p_all = joint_prob(lambda x,y,z: x==1 and y==1 and z==1)
print('Pairwise independence checks:')
print(f' P(X=1)*P(Y=1) = {p_X1*p_Y1:.4f}, P(X=1,Y=1) = {p_X1_Y1:.4f} => {abs(p_X1*p_Y1 - p_X1_Y1) < 1e-9}')
print(f' P(X=1)*P(Z=1) = {p_X1*p_Z1:.4f}, P(X=1,Z=1) = {p_X1_Z1:.4f} => {abs(p_X1*p_Z1 - p_X1_Z1) < 1e-9}')
print(f' P(Y=1)*P(Z=1) = {p_Y1*p_Z1:.4f}, P(Y=1,Z=1) = {p_Y1_Z1:.4f} => {abs(p_Y1*p_Z1 - p_Y1_Z1) < 1e-9}')
print()
print('Mutual independence check:')
print(f' P(X=1)*P(Y=1)*P(Z=1) = {p_X1*p_Y1*p_Z1:.4f}')
print(f' P(X=1,Y=1,Z=1) = {p_all:.4f}')
print(f' Mutually independent: {abs(p_X1*p_Y1*p_Z1 - p_all) < 1e-9}')
print('PASS - demonstrated pairwise independence does NOT imply mutual independence')
4. Random Variables, CDF, PDF, PMF
A random variable $X : \Omega \to \mathbb{R}$ maps outcomes to real numbers.
The CDF $F_X(x) = P(X \le x)$ completely characterises $X$.
For discrete $X$: PMF $p_X(x) = P(X = x)$.
For continuous $X$: PDF $f_X$ such that $P(a \le X \le b) = \int_a^b f_X(x)\,dx$.
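A minimal sketch (assuming scipy, as in the setup cells) of the PDF/CDF link for the standard normal: integrating the density over $[-1, 1]$ matches $F(1) - F(-1)$.

```python
from scipy import stats
from scipy.integrate import quad

a, b = -1.0, 1.0
integral, _ = quad(stats.norm.pdf, a, b)           # ∫_a^b f(x) dx
via_cdf = stats.norm.cdf(b) - stats.norm.cdf(a)    # F(b) - F(a)
print(integral, via_cdf)                           # both ≈ 0.6827
assert abs(integral - via_cdf) < 1e-9
```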
Code cell 13
# === 4.1 CDF Properties and the Universality of the Uniform ===
import numpy as np
from scipy import stats
np.random.seed(42)
# Gaussian CDF
x = np.linspace(-4, 4, 500)
cdf = stats.norm.cdf(x)
pdf = stats.norm.pdf(x)
try:
    import matplotlib.pyplot as plt
    plt.style.use('seaborn-v0_8-whitegrid')
    plt.rcParams['figure.figsize'] = [12, 4]
    HAS_MPL = True
except ImportError:
    HAS_MPL = False

if HAS_MPL:
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))
    # PDF
    axes[0].plot(x, pdf, 'steelblue', lw=2)
    axes[0].fill_between(x, pdf, where=(x >= -1) & (x <= 1), alpha=0.3, color='steelblue')
    axes[0].set_title('PDF: $f(x)$')
    axes[0].set_xlabel('x'); axes[0].set_ylabel('density')
    # CDF
    axes[1].plot(x, cdf, 'darkorange', lw=2)
    axes[1].axhline(0.5, color='gray', ls='--', alpha=0.5)
    axes[1].set_title(r'CDF: $F(x) = P(X \leq x)$')
    axes[1].set_xlabel('x'); axes[1].set_ylabel('probability')
    # Universality of Uniform: if U~Uniform(0,1), then F_inv(U)~X
    u_samples = np.random.uniform(0, 1, 2000)
    x_samples = stats.norm.ppf(u_samples)
    axes[2].hist(x_samples, bins=40, density=True, color='green', alpha=0.7, label='inv-CDF samples')
    axes[2].plot(x, pdf, 'k-', lw=2, label='true PDF')
    axes[2].set_title('Universality of Uniform')
    axes[2].set_xlabel('x'); axes[2].legend()
    plt.suptitle('Standard Normal: PDF, CDF, and Sampling via Inverse CDF', fontsize=13)
    plt.tight_layout()
    plt.show()
# Verify properties
print('CDF properties:')
print(f' F(-inf) ≈ {stats.norm.cdf(-10):.6f} (→ 0)')
print(f' F(0) = {stats.norm.cdf(0):.6f} (= 0.5 for symmetric)')
print(f' F(+inf) ≈ {stats.norm.cdf(10):.6f} (→ 1)')
print(f' F is non-decreasing: {all(np.diff(cdf) >= 0)}')
print('PASS - CDF properties verified')
5. Discrete Distributions
5.1 Bernoulli($p$)
$P(X = 1) = p$, $P(X = 0) = 1 - p$; $E[X] = p$, $\mathrm{Var}(X) = p(1-p)$.
5.2 Geometric($p$)
Number of trials until the first success: $P(X = k) = (1-p)^{k-1}p$ for $k = 1, 2, \dots$; $E[X] = 1/p$, $\mathrm{Var}(X) = (1-p)/p^2$.
Memoryless: $P(X > s + t \mid X > s) = P(X > t)$.
Code cell 15
# === 5.1 Bernoulli Distribution ===
import numpy as np
from scipy import stats
np.random.seed(42)
try:
    import matplotlib.pyplot as plt
    plt.style.use('seaborn-v0_8-whitegrid')
    HAS_MPL = True
except ImportError:
    HAS_MPL = False

# PMF for different p values
p_values = [0.1, 0.3, 0.5, 0.7, 0.9]
print('Bernoulli(p): mean=p, var=p(1-p)')
print(f' {"p":>5} {"E[X]":>8} {"Var(X)":>10} {"H(X) nats":>12}')
for p in p_values:
    rv = stats.bernoulli(p)
    mean_val = rv.mean()
    var_val = rv.var()
    entropy = rv.entropy()
    print(f' {p:>5.1f} {mean_val:>8.4f} {var_val:>10.4f} {entropy:>12.4f}')
print()
print('Maximum variance at p=0.5 (fair coin), maximum entropy at p=0.5')
print('PASS - Bernoulli computed correctly')

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    p_range = np.linspace(0, 1, 200)
    axes[0].plot(p_range, p_range * (1-p_range), 'steelblue', lw=2)
    axes[0].set_xlabel('p'); axes[0].set_ylabel('Var(X) = p(1-p)')
    axes[0].set_title('Bernoulli Variance')
    h_vals = (-p_range*np.log(np.where(p_range > 0, p_range, 1))
              - (1-p_range)*np.log(np.where(1-p_range > 0, 1-p_range, 1)))
    axes[1].plot(p_range, h_vals, 'darkorange', lw=2)
    axes[1].set_xlabel('p'); axes[1].set_ylabel('H(X) nats')
    axes[1].set_title('Bernoulli Entropy')
    plt.tight_layout()
    plt.show()
Code cell 16
# === 5.2 Geometric Distribution — Memoryless Property ===
import numpy as np
from scipy import stats
np.random.seed(42)
p = 0.3 # success probability
rv = stats.geom(p)
k = np.arange(1, 25)
pmf = rv.pmf(k)
print(f'Geometric(p={p}): E[X] = {rv.mean():.4f}, Var = {rv.var():.4f}')
print(f'Theoretical: E[X] = {1/p:.4f}, Var = {(1-p)/p**2:.4f}')
# Verify memoryless property: P(X > 5+3 | X > 5) = P(X > 3)
s, t = 5, 3
p_gt_s = rv.sf(s) # P(X > s)
p_gt_st = rv.sf(s + t) # P(X > s+t)
p_gt_t = rv.sf(t) # P(X > t)
conditional = p_gt_st / p_gt_s # P(X > s+t | X > s)
print(f'\nMemoryless property: P(X > {s+t} | X > {s}) = P(X > {t})?')
print(f' P(X > {s+t} | X > {s}) = {conditional:.6f}')
print(f' P(X > {t}) = {p_gt_t:.6f}')
ok = abs(conditional - p_gt_t) < 1e-10
print(f' PASS: {ok}')
try:
    import matplotlib.pyplot as plt
    plt.style.use('seaborn-v0_8-whitegrid')
    HAS_MPL = True
except ImportError:
    HAS_MPL = False

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.vlines(k, 0, pmf, colors='steelblue', lw=3, alpha=0.8)
    ax.set_xlabel('k (trial of first success)')
    ax.set_ylabel('P(X = k)')
    ax.set_title(f'Geometric(p={p}) PMF')
    plt.tight_layout()
    plt.show()
6. Continuous Distributions
6.1 Uniform($a$, $b$)
$f(x) = \frac{1}{b-a}$ for $a \le x \le b$. $E[X] = \frac{a+b}{2}$, $\mathrm{Var}(X) = \frac{(b-a)^2}{12}$.
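There is no dedicated uniform-distribution cell below, so here is a minimal sketch (a and b are arbitrary illustrative values; scipy.stats.uniform takes loc=a, scale=b-a) checking the mean and variance formulas:

```python
from scipy import stats

a, b = 2.0, 5.0
rv = stats.uniform(loc=a, scale=b - a)   # Uniform(a, b)
print(rv.mean(), (a + b) / 2)            # both 3.5
print(rv.var(), (b - a)**2 / 12)         # both 0.75
```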
6.2 Gaussian $\mathcal{N}(\mu, \sigma^2)$
$f(x) = \frac{1}{\sqrt{2\pi\sigma^2}} \exp\!\left(-\frac{(x-\mu)^2}{2\sigma^2}\right)$; $E[X] = \mu$, $\mathrm{Var}(X) = \sigma^2$.
68-95-99.7 rule: $P(|X - \mu| \le k\sigma) \approx 0.683, 0.954, 0.997$ for $k = 1, 2, 3$.
Code cell 18
# === 6.2 Gaussian Distribution — Properties and the 68-95-99.7 Rule ===
import numpy as np
from scipy import stats
try:
    import matplotlib.pyplot as plt
    plt.style.use('seaborn-v0_8-whitegrid')
    HAS_MPL = True
except ImportError:
    HAS_MPL = False

# 68-95-99.7 rule
print('68-95-99.7 Rule for N(0,1):')
for k in [1, 2, 3]:
    prob = stats.norm.cdf(k) - stats.norm.cdf(-k)
    print(f' P(|Z| <= {k}) = {prob:.6f} ({100*prob:.2f}%)')

# Location-scale: X = mu + sigma*Z
mu, sigma = 2.0, 1.5
rv = stats.norm(loc=mu, scale=sigma)
print(f'\nN(mu={mu}, sigma^2={sigma**2}):')
print(f' Mean = {rv.mean():.4f} (expected {mu})')
print(f' Variance = {rv.var():.4f} (expected {sigma**2})')
print(f' P(X > mu+sigma) = {rv.sf(mu+sigma):.4f} (expected 0.1587)')

if HAS_MPL:
    x = np.linspace(-4, 8, 500)
    fig, axes = plt.subplots(1, 2, figsize=(14, 4))
    # Multiple Gaussians
    configs = [(0, 1, 'steelblue', 'N(0,1)'), (2, 1.5, 'darkorange', 'N(2,1.5²)'),
               (0, 2, 'green', 'N(0,4)')]
    for mu_, sigma_, col, label in configs:
        pdf_ = stats.norm.pdf(x, mu_, sigma_)
        axes[0].plot(x, pdf_, color=col, lw=2, label=label)
    axes[0].set_title('Gaussian Family'); axes[0].legend()
    axes[0].set_xlabel('x'); axes[0].set_ylabel('density')
    # 68-95-99.7 shaded regions
    x0 = np.linspace(-4, 4, 500)
    pdf0 = stats.norm.pdf(x0)
    axes[1].plot(x0, pdf0, 'k-', lw=2)
    colors = ['#2196F3', '#4CAF50', '#FF9800']
    for i, (k, c) in enumerate(zip([1, 2, 3], colors)):
        mask = np.abs(x0) <= k
        axes[1].fill_between(x0, pdf0, where=mask, alpha=0.3, color=c,
                             label=f'{k}σ: {100*(stats.norm.cdf(k)-stats.norm.cdf(-k)):.1f}%')
    axes[1].set_title('68-95-99.7 Rule'); axes[1].legend()
    axes[1].set_xlabel('z')
    plt.tight_layout(); plt.show()
print('PASS - Gaussian properties verified')
Code cell 19
# === 6.3 Standardisation and Z-scores ===
import numpy as np
from scipy import stats
# If X ~ N(mu, sigma^2), then Z = (X - mu)/sigma ~ N(0, 1)
mu, sigma = 170, 10 # height in cm, approximately
# Questions about height
heights = [180, 155, 170]
print('Height distribution N(170, 10^2):')
for h in heights:
    z = (h - mu) / sigma
    p_below = stats.norm.cdf(z)
    p_above = 1 - p_below
    print(f' h={h}: z={z:+.1f}, P(X≤{h})={p_below:.4f}, P(X>{h})={p_above:.4f}')
# Verify standardisation preserves distribution
np.random.seed(42)
samples = np.random.normal(mu, sigma, 10000)
z_scores = (samples - mu) / sigma
print(f'\nStandardised samples: mean={z_scores.mean():.4f}, std={z_scores.std():.4f}')
print(f'Expected: mean=0, std=1')
print(f'PASS: {abs(z_scores.mean()) < 0.05 and abs(z_scores.std()-1) < 0.05}')
7. Expectation and Variance
Linearity: $E[aX + bY] = a\,E[X] + b\,E[Y]$ (always, no independence needed)
LOTUS: $E[g(X)] = \sum_x g(x)\,p_X(x)$ or $E[g(X)] = \int g(x)\,f_X(x)\,dx$, so there is no need to find the distribution of $g(X)$.
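Code cell 21 below checks LOTUS for a discrete PMF; as a continuous-case sketch (assuming scipy), $E[X^2]$ for $X \sim \mathcal{N}(0,1)$ can be computed by integrating $x^2 f(x)$ directly, without deriving the distribution of $X^2$:

```python
import numpy as np
from scipy import stats
from scipy.integrate import quad

e_x2, _ = quad(lambda x: x**2 * stats.norm.pdf(x), -np.inf, np.inf)
print(e_x2)   # ≈ 1.0, i.e. Var(X) + E[X]^2 for N(0, 1)
```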
Code cell 21
# === 7.1 Expectation and Variance from First Principles ===
import numpy as np
from scipy import stats
# Discrete: compute E[X] and Var(X) from PMF
def discrete_moments(values, probs):
    """Compute mean and variance from PMF."""
    values = np.array(values, dtype=float)
    probs = np.array(probs, dtype=float)
    assert abs(probs.sum() - 1) < 1e-10, 'Probabilities must sum to 1'
    mean = (values * probs).sum()
    var = ((values - mean)**2 * probs).sum()
    # Alternative: E[X^2] - (E[X])^2
    var2 = (values**2 * probs).sum() - mean**2
    return mean, var, var2
# Fair die
values = np.arange(1, 7)
probs = np.ones(6) / 6
mean, var, var2 = discrete_moments(values, probs)
print(f'Fair die: E[X] = {mean:.4f} (expected 3.5), Var = {var:.4f} (expected {35/12:.4f})')
print(f'Alternative formula: Var = {var2:.4f}')
print(f'PASS: formulas agree: {abs(var - var2) < 1e-12}')
# LOTUS: E[X^2] for die
e_x2 = (values**2 * probs).sum()
print(f'\nLOTUS: E[X^2] = {e_x2:.4f} (= mean^2 + var = {mean**2 + var:.4f})')
# Linearity: E[aX + b]
a, b = 3, -2
e_aXb_direct = (a * values + b) @ probs
e_aXb_linearity = a * mean + b
print(f'\nLinearity: E[{a}X + ({b})] = {e_aXb_direct:.4f} (direct) = {e_aXb_linearity:.4f} (linearity)')
print(f'PASS: {abs(e_aXb_direct - e_aXb_linearity) < 1e-12}')
Code cell 22
# === 7.2 Variance Properties ===
import numpy as np
np.random.seed(42)
# Var(aX+b) = a^2 * Var(X)
n = 100000
X = np.random.normal(3, 2, n) # N(3, 4)
a, b = 5, -7
Y = a * X + b
print('Variance properties (simulation):')
print(f' Var(X) = {X.var():.4f} (expected 4.0)')
print(f' Var(aX + b) = {Y.var():.4f} (expected {a**2 * 4:.4f} = a^2*Var(X))')
print(f' a^2 * Var(X) = {a**2 * X.var():.4f}')
print(f' PASS: {abs(Y.var() - a**2 * X.var()) < 0.1}')
# For independent RVs: Var(X+Y) = Var(X) + Var(Y)
X1 = np.random.normal(0, 1, n)
X2 = np.random.normal(0, 2, n)
S = X1 + X2 # sum of independent Gaussians
print(f'\nVar(X1) = {X1.var():.4f} (expected 1.0)')
print(f'Var(X2) = {X2.var():.4f} (expected 4.0)')
print(f'Var(X1+X2) = {S.var():.4f} (expected 5.0 = 1+4)')
print(f'PASS: {abs(S.var() - (X1.var() + X2.var())) < 0.1}')
8. Transformations of Random Variables
8.1 CDF Method
If $Y = g(X)$ where $g$ is monotone increasing: $F_Y(y) = P(g(X) \le y) = F_X(g^{-1}(y))$
8.2 Change-of-Variables (PDF Method)
$f_Y(y) = f_X(g^{-1}(y)) \left|\frac{d}{dy} g^{-1}(y)\right|$
The Jacobian term $\left|\frac{d}{dy} g^{-1}(y)\right|$ corrects for the stretching/compression of probability mass.
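A minimal sketch of the change-of-variables formula applied to the log-normal example of Code cell 25 ($Y = e^X$ with $X \sim \mathcal{N}(0, 0.5^2)$, so $g^{-1}(y) = \log y$ and the Jacobian term is $1/y$), checked against scipy's built-in log-normal density:

```python
import numpy as np
from scipy import stats

mu, sigma = 0.0, 0.5
y = np.linspace(0.1, 5.0, 50)
# f_Y(y) = f_X(log y) * |d/dy log y| = f_X(log y) / y
pdf_change_of_vars = stats.norm.pdf(np.log(y), loc=mu, scale=sigma) / y
pdf_scipy = stats.lognorm.pdf(y, s=sigma, scale=np.exp(mu))
assert np.allclose(pdf_change_of_vars, pdf_scipy)
print('Change-of-variables PDF matches scipy.stats.lognorm')
```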
Code cell 24
# === 8.1 Change of Variables: Y = X^2 for X ~ N(0,1) ===
import numpy as np
from scipy import stats
np.random.seed(42)
try:
    import matplotlib.pyplot as plt
    plt.style.use('seaborn-v0_8-whitegrid')
    HAS_MPL = True
except ImportError:
    HAS_MPL = False
n = 200000
X = np.random.standard_normal(n)
Y = X**2 # Y ~ Chi-squared(1)
# True PDF of Y = X^2 is chi-squared(1)
y_vals = np.linspace(0.01, 8, 500)
pdf_y_theory = stats.chi2.pdf(y_vals, df=1)
print('Y = X^2 where X ~ N(0,1) => Y ~ Chi-squared(1)')
print(f'Sample mean of Y = {Y.mean():.4f} (expected 1.0 = E[X^2] for N(0,1))')
print(f'Sample var of Y = {Y.var():.4f} (expected 2.0 for chi-sq(1))')
if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(14, 4))
    axes[0].hist(X, bins=80, density=True, color='steelblue', alpha=0.7, label='X ~ N(0,1)')
    xr = np.linspace(-4, 4, 400)
    axes[0].plot(xr, stats.norm.pdf(xr), 'k-', lw=2)
    axes[0].set_title('X ~ N(0,1)'); axes[0].set_xlabel('x')
    axes[1].hist(Y, bins=100, range=(0, 8), density=True,
                 color='darkorange', alpha=0.7, label='Y = X²')
    axes[1].plot(y_vals, pdf_y_theory, 'k-', lw=2, label='χ²(1) PDF')
    axes[1].set_title('Y = X² ~ Chi-squared(1)'); axes[1].set_xlabel('y')
    axes[1].legend()
    plt.tight_layout(); plt.show()
print('PASS - transformation demonstrated')
Code cell 25
# === 8.2 Log-Normal: Y = exp(X) for X ~ N(mu, sigma^2) ===
import numpy as np
from scipy import stats
np.random.seed(42)
mu, sigma = 0.0, 0.5
X = np.random.normal(mu, sigma, 100000)
Y = np.exp(X)
# Theoretical moments of log-normal:
# E[Y] = exp(mu + sigma^2/2)
# Var[Y] = (exp(sigma^2)-1) * exp(2*mu + sigma^2)
e_y_theory = np.exp(mu + sigma**2/2)
var_y_theory = (np.exp(sigma**2)-1) * np.exp(2*mu + sigma**2)
print(f'Log-Normal(mu={mu}, sigma^2={sigma**2}):')
print(f' E[Y] sample={Y.mean():.4f} theory={e_y_theory:.4f}')
print(f' Var[Y] sample={Y.var():.4f} theory={var_y_theory:.4f}')
print(f' PASS E: {abs(Y.mean() - e_y_theory) < 0.01}')
print(f' PASS V: {abs(Y.var() - var_y_theory) < 0.05}')
print()
print('Note: E[exp(X)] = exp(E[X] + Var(X)/2) > exp(E[X]) — Jensen inequality!')
print(f' exp(E[X]) = {np.exp(mu):.4f} vs E[exp(X)] = {e_y_theory:.4f}')
try:
    import matplotlib.pyplot as plt
    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.hist(Y, bins=100, range=(0, 5), density=True, color='green', alpha=0.7)
    y_range = np.linspace(0.01, 5, 300)
    ax.plot(y_range, stats.lognorm.pdf(y_range, s=sigma, scale=np.exp(mu)), 'k-', lw=2)
    ax.axvline(e_y_theory, color='red', ls='--', label=f'E[Y]={e_y_theory:.3f}')
    ax.axvline(np.exp(mu), color='blue', ls='--', label=f'exp(E[X])={np.exp(mu):.3f}')
    ax.set_title('Log-Normal Distribution: Y = exp(X)')
    ax.set_xlabel('y'); ax.legend()
    plt.tight_layout(); plt.show()
except ImportError:
    pass
9. ML Applications: Cross-Entropy and Negative Log-Likelihood
Binary cross-entropy loss (logistic regression): $\mathcal{L} = -\frac{1}{N}\sum_{i=1}^{N}\left[y_i \log \hat{y}_i + (1 - y_i)\log(1 - \hat{y}_i)\right]$
This is the negative log-likelihood under a Bernoulli model: $p(y_i \mid \hat{y}_i) = \hat{y}_i^{\,y_i}(1 - \hat{y}_i)^{1 - y_i}$.
Categorical cross-entropy (softmax output): $\mathcal{L} = -\sum_{k=1}^{K} y_k \log \hat{y}_k$
For one-hot labels: $\mathcal{L} = -\log \hat{y}_c$, where $c$ is the true class.
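A minimal sketch (the logits are made-up illustrative values) showing that categorical cross-entropy with a one-hot label reduces to minus the log-probability of the true class:

```python
import numpy as np

def softmax(z):
    z = z - z.max()           # shift for numerical stability
    e = np.exp(z)
    return e / e.sum()

logits = np.array([2.0, 0.5, -1.0])            # hypothetical 3-class model outputs
y_onehot = np.array([1.0, 0.0, 0.0])           # true class is class 0
y_hat = softmax(logits)
ce_full = -np.sum(y_onehot * np.log(y_hat))    # -sum_k y_k log y_hat_k
ce_true_class = -np.log(y_hat[0])              # -log y_hat_c
print(ce_full, ce_true_class)                  # identical
```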
Code cell 27
# === 9.1 Cross-Entropy Loss from Scratch ===
import numpy as np
np.random.seed(42)
def binary_cross_entropy(y_true, y_pred, eps=1e-15):
    """BCE loss. Clips predictions to avoid log(0)."""
    y_pred = np.clip(y_pred, eps, 1 - eps)
    return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
# Perfect predictions vs. random
y_true = np.array([1, 1, 0, 0, 1, 0])
y_perfect = np.array([0.99, 0.99, 0.01, 0.01, 0.99, 0.01])
y_random = np.array([0.50, 0.50, 0.50, 0.50, 0.50, 0.50])
y_wrong = np.array([0.01, 0.01, 0.99, 0.99, 0.01, 0.99])
print('Binary Cross-Entropy:')
print(f' Perfect: {binary_cross_entropy(y_true, y_perfect):.4f} (→ 0 for perfect)')
print(f' Random: {binary_cross_entropy(y_true, y_random):.4f} (≈ log(2) = {np.log(2):.4f})')
print(f' All wrong:{binary_cross_entropy(y_true, y_wrong):.4f} (→ ∞ for certain error)')
# Verify: entropy of Bernoulli(0.5) = log(2) ≈ 0.693
from scipy import stats
print(f'\nEntropy of Bernoulli(0.5) = {stats.bernoulli(0.5).entropy():.4f} = log(2) = {np.log(2):.4f}')
print('Predicting 0.5 everywhere gives log(2), the entropy of a fair coin; a useful model should do better')
print('PASS - cross-entropy loss properties verified')
Code cell 28
# === 9.2 Perplexity — LLM Evaluation Metric ===
import numpy as np
# Perplexity = exp(cross-entropy) = exp(NLL)
# For a language model predicting tokens, perplexity measures
# the effective vocabulary size the model is choosing from at each step
def perplexity_from_nll(nll):
    """Perplexity from average negative log-likelihood (nats)."""
    return np.exp(nll)

# Simulated NLL values for different model qualities
scenarios = [
    ('Perfect model (always correct)', -np.log(0.9999)),
    ('Good LLM (GPT-4 level, ~2.5 NLL)', 2.5),
    ('Moderate model', 4.0),
    ('Uniform over 50k vocab', np.log(50000)),
]
print(f' {"Scenario":<45} {"NLL":>8} {"Perplexity":>12}')
print('-' * 68)
for name, nll in scenarios:
    ppl = perplexity_from_nll(nll)
    print(f' {name:<45} {nll:>8.3f} {ppl:>12.1f}')
print()
print('Interpretation: perplexity = effective branching factor')
print('A perplexity of 10 means the model is, on average, as uncertain as a uniform choice among 10 options')
print('PASS - perplexity computed')
10. Monte Carlo Simulation
Monte Carlo estimates expectations by averaging samples: $E[g(X)] \approx \frac{1}{n}\sum_{i=1}^{n} g(x_i)$ with $x_i \sim p_X$.
Standard error $\approx \sigma/\sqrt{n}$, so the estimate converges at rate $O(1/\sqrt{n})$ regardless of dimension.
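A minimal sketch of the standard-error claim, reusing the $E[X^2]$ example from Code cell 31: the spread of repeated Monte Carlo estimates should match $\sigma/\sqrt{n}$, where $\sigma = \sqrt{2}$ is the standard deviation of $X^2$ when $X \sim \mathcal{N}(0,1)$ (the 200 repetitions and $n = 10{,}000$ are arbitrary choices):

```python
import numpy as np

rng = np.random.default_rng(0)
n = 10_000
estimates = [(rng.standard_normal(n)**2).mean() for _ in range(200)]
empirical_se = np.std(estimates)
theoretical_se = np.sqrt(2) / np.sqrt(n)   # sigma / sqrt(n)
print(empirical_se, theoretical_se)        # both ≈ 0.014
```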
Code cell 30
# === 10.1 Monte Carlo Estimation ===
import numpy as np
np.random.seed(42)
try:
    import matplotlib.pyplot as plt
    plt.style.use('seaborn-v0_8-whitegrid')
    HAS_MPL = True
except ImportError:
    HAS_MPL = False

# Estimate pi using Monte Carlo
def estimate_pi(n):
    x = np.random.uniform(-1, 1, n)
    y = np.random.uniform(-1, 1, n)
    inside = (x**2 + y**2) <= 1
    return 4 * inside.mean()

ns = [100, 1000, 10000, 100000, 1000000]
print('Monte Carlo estimate of π:')
for n in ns:
    pi_est = estimate_pi(n)
    error = abs(pi_est - np.pi)
    print(f' n={n:>8,d}: π̂ = {pi_est:.5f}, error = {error:.5f}, 1/√n = {1/n**0.5:.5f}')
print(f'\nTrue π = {np.pi:.6f}')
print('Error scales as O(1/√n) — decreasing by 10× requires 100× more samples')

if HAS_MPL:
    n_plot = 10000
    x = np.random.uniform(-1, 1, n_plot)
    y = np.random.uniform(-1, 1, n_plot)
    inside = (x**2 + y**2) <= 1
    fig, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(x[inside], y[inside], s=1, color='steelblue', alpha=0.5)
    ax.scatter(x[~inside], y[~inside], s=1, color='red', alpha=0.5)
    theta = np.linspace(0, 2*np.pi, 300)
    ax.plot(np.cos(theta), np.sin(theta), 'k-', lw=2)
    ax.set_aspect('equal')
    ax.set_title(f'π ≈ {4*inside.mean():.4f} (n={n_plot:,d})')
    plt.tight_layout(); plt.show()
Code cell 31
# === 10.2 Monte Carlo Convergence Rate ===
import numpy as np
np.random.seed(42)
try:
    import matplotlib.pyplot as plt
    plt.style.use('seaborn-v0_8-whitegrid')
    HAS_MPL = True
except ImportError:
    HAS_MPL = False

# Estimate E[X^2] for X ~ N(0,1) (true value = 1)
true_value = 1.0
n_trials = 50
max_n = 50000
n_values = np.logspace(1, np.log10(max_n), 50, dtype=int)
errors = []
for n in n_values:
    trial_errors = []
    for _ in range(n_trials):
        X = np.random.standard_normal(n)
        est = (X**2).mean()
        trial_errors.append(abs(est - true_value))
    errors.append(np.mean(trial_errors))

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.loglog(n_values, errors, 'steelblue', lw=2, label='Mean |error|')
    ax.loglog(n_values, 1/np.sqrt(n_values), 'r--', lw=2, label='O(1/√n) reference')
    ax.set_xlabel('Number of samples (n)')
    ax.set_ylabel('Mean absolute error')
    ax.set_title('Monte Carlo Convergence: E[X²] for X ~ N(0,1)')
    ax.legend()
    plt.tight_layout(); plt.show()
print(f'At n=100: error ≈ {errors[np.searchsorted(n_values, 100)]:.4f}')
print(f'At n=10000: error ≈ {errors[np.searchsorted(n_values, 10000)]:.5f}')
print('10× more samples → ~3× smaller error (confirms O(1/√n))')
11. Information Theory: Entropy and KL Divergence
Shannon entropy: $H(p) = -\sum_x p(x)\log p(x)$
KL divergence: $D_{\mathrm{KL}}(p \,\Vert\, q) = \sum_x p(x)\log\frac{p(x)}{q(x)} \ge 0$
Cross-entropy: $H(p, q) = -\sum_x p(x)\log q(x) = H(p) + D_{\mathrm{KL}}(p \,\Vert\, q)$
Minimising cross-entropy loss $\equiv$ minimising KL divergence $\equiv$ maximum likelihood estimation (the $H(p)$ term does not depend on the model $q$).
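A minimal sketch of the last equivalence, reusing the $p$ and $q$ from Code cell 34: the average negative log-likelihood of samples drawn from $p$, scored under the model $q$, converges to $H(p, q)$, so maximising likelihood over $q$ is the same as minimising $H(p, q)$ and hence $D_{\mathrm{KL}}(p \,\Vert\, q)$, because $H(p)$ does not depend on $q$.

```python
import numpy as np

rng = np.random.default_rng(0)
p = np.array([0.6, 0.3, 0.1])             # true distribution
q = np.array([0.4, 0.4, 0.2])             # model distribution
samples = rng.choice(3, size=200_000, p=p)
avg_nll = -np.mean(np.log(q[samples]))    # Monte Carlo estimate of E_p[-log q(X)]
h_pq = -np.sum(p * np.log(q))             # cross-entropy H(p, q)
print(avg_nll, h_pq)                      # both ≈ 0.986
```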
Code cell 33
# === 11.1 Entropy of Categorical Distributions ===
import numpy as np
np.random.seed(42)
try:
    import matplotlib.pyplot as plt
    plt.style.use('seaborn-v0_8-whitegrid')
    HAS_MPL = True
except ImportError:
    HAS_MPL = False

def entropy(probs, base=np.e):
    """Shannon entropy in nats (base=e) or bits (base=2)."""
    probs = np.array(probs)
    probs = probs[probs > 0]  # ignore zero probabilities
    return -np.sum(probs * np.log(probs) / np.log(base))

n = 10  # vocabulary size (small example)
distributions = {
    'Uniform': np.ones(n)/n,
    'Peaked (0.8)': np.array([0.8] + [0.2/(n-1)]*(n-1)),
    'Deterministic': np.array([1.0] + [0.0]*(n-1)),
}
print(f'Entropy of {n}-class distributions (bits):')
for name, probs in distributions.items():
    h = entropy(probs, base=2)
    print(f' {name:<25} H = {h:.4f} bits (max = {np.log2(n):.4f})')

if HAS_MPL:
    fig, axes = plt.subplots(1, 3, figsize=(15, 3))
    for ax, (name, probs) in zip(axes, distributions.items()):
        ax.bar(range(n), probs, color='steelblue', alpha=0.8)
        h = entropy(probs, base=2)
        ax.set_title(f'{name}\nH = {h:.2f} bits')
        ax.set_xlabel('class'); ax.set_ylabel('probability')
    plt.tight_layout(); plt.show()
print('\nMax entropy = log2(n) achieved by uniform distribution')
print('Min entropy = 0 achieved by deterministic distribution')
print('PASS - entropy properties verified')
Code cell 34
# === 11.2 KL Divergence: Asymmetry and Non-negativity ===
import numpy as np
def kl_divergence(p, q, eps=1e-15):
    """KL(p||q) — note: asymmetric! p is the 'true' distribution."""
    p = np.array(p, dtype=float)
    q = np.array(q, dtype=float)
    # Only include terms where p > 0
    mask = p > 0
    return np.sum(p[mask] * np.log(p[mask] / (q[mask] + eps)))
# Two distributions
p = np.array([0.6, 0.3, 0.1]) # true
q = np.array([0.1, 0.3, 0.6]) # model (inverted)
u = np.array([1/3, 1/3, 1/3]) # uniform
print('KL divergence examples:')
print(f' KL(p||p) = {kl_divergence(p, p):.6f} (same distribution → 0)')
print(f' KL(p||q) = {kl_divergence(p, q):.4f}')
print(f' KL(q||p) = {kl_divergence(q, p):.4f} (asymmetric!)')
print(f' KL(p||u) = {kl_divergence(p, u):.4f} (distance from uniform)')
# Cross-entropy decomposition
from scipy import stats
p_ = np.array([0.6, 0.3, 0.1])
q_ = np.array([0.4, 0.4, 0.2])
h_p = -np.sum(p_ * np.log(p_)) # entropy of p
kl_pq = kl_divergence(p_, q_) # KL(p||q)
h_pq = -np.sum(p_ * np.log(q_)) # cross-entropy H(p,q)
print(f'\nCross-entropy decomposition H(p,q) = H(p) + KL(p||q):')
print(f' H(p) = {h_p:.4f}')
print(f' KL(p||q) = {kl_pq:.4f}')
print(f' H(p,q) = {h_pq:.4f}')
print(f' H(p)+KL(p||q)= {h_p + kl_pq:.4f}')
ok = abs(h_pq - (h_p + kl_pq)) < 1e-10
print(f' PASS: {ok}')
Summary
| Concept | Key Formula | ML Use |
|---|---|---|
| Probability axioms | $P(A) \ge 0$, $P(\Omega) = 1$, $P(\cup_i A_i) = \sum_i P(A_i)$ for disjoint $A_i$ | Formal foundation for all probabilistic models |
| Conditional probability | $P(A \mid B) = P(A \cap B)/P(B)$ | Posterior inference, attention |
| Bayes' theorem | $P(A \mid B) = P(B \mid A)\,P(A)/P(B)$ | Bayesian inference, MAP estimation |
| Independence | $P(A \cap B) = P(A)\,P(B)$ | i.i.d. assumptions, dropout |
| CDF | $F(x) = P(X \le x)$ | Universality of Uniform, sampling |
| PDF/PMF | $P(a \le X \le b) = \int_a^b f(x)\,dx$ or $p(x) = P(X = x)$ | Likelihood functions |
| Expectation | $E[X] = \sum_x x\,p(x)$ or $\int x\,f(x)\,dx$ | Loss functions, gradient estimates |
| Variance | $\mathrm{Var}(X) = E[X^2] - (E[X])^2$ | Regularisation, noise analysis |
| Cross-entropy | $H(p, q) = -\sum_x p(x)\log q(x)$ | Classification loss |
| KL divergence | $D_{\mathrm{KL}}(p \,\Vert\, q) = \sum_x p(x)\log\frac{p(x)}{q(x)}$ | VAE, RLHF, fine-tuning |
Next: §02 Common Distributions — named distributions (Gaussian, Poisson, Beta, Dirichlet) with full moments, MGFs, and ML applications.