Theory Notebook
Converted from theory.ipynb for web reading.
Descriptive Statistics — Theory Notebook
"The first step in wisdom is knowing what you don't know — and in statistics, that begins by knowing what your data actually looks like." — John Tukey
Interactive exploration of every descriptive statistic with ML case studies.
Sections: §1 Intuition/EDA · §2 Central Tendency & Spread · §3 Robust Statistics · §4 Bivariate · §5 Multivariate · §6 Standardisation · §7 Visualisation · §8 ML Applications
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
import seaborn as sns
sns.set_theme(style="whitegrid", palette="colorblind")
HAS_SNS = True
except ImportError:
plt.style.use("seaborn-v0_8-whitegrid")
HAS_SNS = False
mpl.rcParams.update({
"figure.figsize": (10, 6),
"figure.dpi": 120,
"font.size": 13,
"axes.titlesize": 15,
"axes.labelsize": 13,
"xtick.labelsize": 11,
"ytick.labelsize": 11,
"legend.fontsize": 11,
"legend.framealpha": 0.85,
"lines.linewidth": 2.0,
"axes.spines.top": False,
"axes.spines.right": False,
"savefig.bbox": "tight",
"savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import numpy as np
import scipy.linalg as la
from scipy import stats
try:
import matplotlib.pyplot as plt
import matplotlib
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = [12, 5]
plt.rcParams['font.size'] = 12
HAS_MPL = True
except ImportError:
HAS_MPL = False
try:
import seaborn as sns
HAS_SNS = True
except ImportError:
HAS_SNS = False
np.set_printoptions(precision=6, suppress=True)
np.random.seed(42)
print('Setup complete.')
1. Intuition — Anscombe's Quartet
Four datasets with identical summary statistics but completely different structures. This is the canonical demonstration of why EDA (plotting) must precede modelling.
Code cell 5
# === 1.1 Anscombe's Quartet ===
# The four datasets (exact values from Anscombe 1973)
x1 = np.array([10,8,13,9,11,14,6,4,12,7,5], dtype=float)
y1 = np.array([8.04,6.95,7.58,8.81,8.33,9.96,7.24,4.26,10.84,4.82,5.68])
x2 = np.array([10,8,13,9,11,14,6,4,12,7,5], dtype=float)
y2 = np.array([9.14,8.14,8.74,8.77,9.26,8.10,6.13,3.10,9.13,7.26,4.74])
x3 = np.array([10,8,13,9,11,14,6,4,12,7,5], dtype=float)
y3 = np.array([7.46,6.77,12.74,7.11,7.81,8.84,6.08,5.39,8.15,6.42,5.73])
x4 = np.array([8,8,8,8,8,8,8,19,8,8,8], dtype=float)
y4 = np.array([6.58,5.76,7.71,8.84,8.47,7.04,5.25,12.50,5.56,7.91,6.89])
print('Anscombe\'s Quartet — Summary Statistics:')
print(f'{'Dataset':<12} {'x mean':<10} {'y mean':<10} {'x var':<10} {'y var':<10} {'r':<8}')
print('-' * 60)
for name, x, y in [('I', x1,y1), ('II', x2,y2), ('III', x3,y3), ('IV', x4,y4)]:
print(f'{name:<12} {np.mean(x):<10.3f} {np.mean(y):<10.3f} '
f'{np.var(x,ddof=1):<10.3f} {np.var(y,ddof=1):<10.3f} '
f'{np.corrcoef(x,y)[0,1]:<8.3f}')
# Verify identical statistics
datasets = [(x1,y1),(x2,y2),(x3,y3),(x4,y4)]
xmeans = [np.mean(x) for x,y in datasets]
ymeans = [np.mean(y) for x,y in datasets]
assert max(xmeans)-min(xmeans) < 0.01
assert max(ymeans)-min(ymeans) < 0.01
print('\nPASS - All four datasets have ~identical summary statistics')
print('LESSON: Always plot your data before reporting statistics.')
Code cell 6
# === 1.2 Anscombe Visualisation ===
if HAS_MPL:
fig, axes = plt.subplots(1, 4, figsize=(16, 4))
datasets = [(x1,y1,'I'),(x2,y2,'II'),(x3,y3,'III'),(x4,y4,'IV')]
colors = ['#0072B2','#D55E00','#009E73','#CC79A7']
for ax, (x,y,label), col in zip(axes, datasets, colors):
ax.scatter(x, y, color=col, s=60, zorder=3)
m, b = np.polyfit(x, y, 1)
xp = np.linspace(3, 20, 100)
ax.plot(xp, m*xp+b, 'k--', alpha=0.6, lw=1.5)
ax.set_title(f'Dataset {label}\nr={np.corrcoef(x,y)[0,1]:.3f}')
ax.set_xlabel('x'); ax.set_ylabel('y')
ax.set_xlim(2, 21); ax.set_ylim(2, 14)
plt.suptitle("Anscombe's Quartet: Same Statistics, Different Data",
fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()
print('Note: The same regression line fits all four — but only Dataset I is appropriate.')
else:
print('Matplotlib not available.')
2.1 Measures of Central Tendency
Mean, median, and mode — three different answers to 'what is the centre of the data?' Each minimises a different loss function.
| Statistic | Loss minimised | Sensitivity to outliers |
|---|---|---|
| Mean | $\sum_i (x_i - c)^2$ (L2) | High — pulled by outliers |
| Median | $\sum_i \lvert x_i - c \rvert$ (L1) | Low — robust |
| Mode | 0-1 loss | N/A — for categorical data |
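The mode's 0-1 loss column is not exercised in the cell below; here is a minimal sketch (the categorical sample is made up for illustration) showing that the mode is the constant prediction with the fewest mismatches:
import numpy as np
# Hypothetical categorical sample — the mode minimises 0-1 loss (count of mismatches)
labels = np.array(list('AAABBC'))
values, counts = np.unique(labels, return_counts=True)
mode = values[np.argmax(counts)]
losses = {c: int(np.sum(labels != c)) for c in values}   # 0-1 loss of predicting c everywhere
print(losses)                                            # {'A': 3, 'B': 4, 'C': 5}
print(mode, min(losses, key=losses.get))                 # both 'A'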
Code cell 8
# === 2.1 Central Tendency ===
np.random.seed(42)
# Clean Gaussian + one outlier
x_clean = np.random.normal(0, 1, 99)
x_outlier = np.append(x_clean, 50.0) # one extreme outlier
print('Central tendency measures:')
print(f' {'Clean':>10} {'With outlier':>14}')
print(f' Mean: {np.mean(x_clean):>10.4f} {np.mean(x_outlier):>14.4f}')
print(f' Median: {np.median(x_clean):>10.4f} {np.median(x_outlier):>14.4f}')
# Verify mean minimises sum-of-squares
def sum_sq(x, c): return np.sum((x - c)**2)
cs = np.linspace(-2, 2, 1000)
min_c = cs[np.argmin([sum_sq(x_clean, c) for c in cs])]
print(f'\nMinimiser of Σ(x-c)²: {min_c:.4f} (sample mean: {np.mean(x_clean):.4f})')
assert abs(min_c - np.mean(x_clean)) < 0.01
print('PASS - mean minimises squared deviations')
# Verify median minimises sum-of-absolute-deviations
def sum_abs(x, c): return np.sum(np.abs(x - c))
min_c_l1 = cs[np.argmin([sum_abs(x_clean, c) for c in cs])]
print(f'Minimiser of Σ|x-c|: {min_c_l1:.4f} (sample median: {np.median(x_clean):.4f})')
assert abs(min_c_l1 - np.median(x_clean)) < 0.01
print('PASS - median minimises absolute deviations')
2.2 Measures of Spread
Bessel's correction: dividing by $n-1$ instead of $n$ makes the sample variance unbiased.
NumPy's default (ddof=0) is the MLE (biased); use ddof=1 for the unbiased sample variance.
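For reference, the size of the bias is a one-line identity (standard result, stated here rather than derived in the cell below):
$$\mathbb{E}\!\left[\tfrac{1}{n}\textstyle\sum_{i=1}^{n}(x_i-\bar x)^2\right] = \tfrac{n-1}{n}\,\sigma^2 \quad\Longrightarrow\quad s^2 = \tfrac{1}{n-1}\textstyle\sum_{i=1}^{n}(x_i-\bar x)^2,\qquad \mathbb{E}[s^2]=\sigma^2.$$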
Code cell 10
# === 2.2 Measures of Spread and Bessel's Correction ===
np.random.seed(42)
sigma_true = 3.0
n_vals = [5, 10, 20, 50, 100, 500]
print('Bessel correction — bias comparison (10000 simulations each):')
print(f' {'n':>5} {'E[s²_biased]':>14} {'E[s²_unbiased]':>16} {'True σ²':>8}')
for n in n_vals:
sims = 10000
biased = []
unbiased = []
for _ in range(sims):
x = np.random.normal(0, sigma_true, n)
biased.append(np.var(x, ddof=0))
unbiased.append(np.var(x, ddof=1))
print(f' {n:>5} {np.mean(biased):>14.4f} {np.mean(unbiased):>16.4f} {sigma_true**2:>8.4f}')
print('\nPASS - unbiased estimator (ddof=1) tracks true σ² at all n')
print('LESSON: Always use np.var(x, ddof=1) or np.std(x, ddof=1) for sample statistics.')
2.3 Shape Statistics: Skewness and Kurtosis
- Skewness $g_1 > 0$: long right tail (mean > median) — income, response times
- Excess kurtosis $g_2 > 0$: heavier tails than the Gaussian (leptokurtic) — gradient norms, financial returns (the moment-based definitions are sketched just below)
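The Fisher definitions used by scipy.stats can be reproduced directly from central moments — a small sketch (assuming the default biased moment estimators that stats.skew and stats.kurtosis use):
import numpy as np
from scipy import stats
rng = np.random.default_rng(0)
x = rng.exponential(1.0, 10_000)
m2 = np.mean((x - x.mean())**2)   # second central moment
m3 = np.mean((x - x.mean())**3)   # third central moment
m4 = np.mean((x - x.mean())**4)   # fourth central moment
g1 = m3 / m2**1.5                 # Fisher skewness g1
g2 = m4 / m2**2 - 3               # excess kurtosis g2
print(g1, stats.skew(x))          # should agree to float precision
print(g2, stats.kurtosis(x))      # should agree to float precision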
Code cell 12
# === 2.3 Skewness and Kurtosis ===
np.random.seed(42)
n = 5000
# Four distributions with different shapes
data = {
'Gaussian N(0,1)': np.random.normal(0, 1, n),
'Exponential(1)': np.random.exponential(1, n),
'Uniform(-1,1)': np.random.uniform(-1, 1, n),
'Student-t(3)': np.random.standard_t(3, n),
}
print(f'{'Distribution':<20} {'Skew g₁':>10} {'Kurt g₂':>10} {'Mean':>8} {'Median':>8}')
print('-' * 60)
for name, d in data.items():
print(f'{name:<20} {stats.skew(d):>10.3f} {stats.kurtosis(d):>10.3f} '
f'{np.mean(d):>8.3f} {np.median(d):>8.3f}')
# Theoretical values
theory = {'Gaussian N(0,1)': (0, 0), 'Exponential(1)': (2, 6),
'Uniform(-1,1)': (0, -1.2), 'Student-t(3)': (0, float('inf'))}
print('\nTheoretical skewness/kurtosis:')
for name, (g1, g2) in theory.items():
print(f' {name}: g₁={g1}, g₂={g2}')
# Verify Gaussian
g = data['Gaussian N(0,1)']
assert abs(stats.skew(g)) < 0.1, f'Gaussian skew too large: {stats.skew(g):.3f}'
assert abs(stats.kurtosis(g)) < 0.3, f'Gaussian kurt too large: {stats.kurtosis(g):.3f}'
print('\nPASS - Gaussian has near-zero skewness and kurtosis')
2.4 Order Statistics, Quantiles, and the Empirical CDF
The empirical CDF $\hat F_n(x) = \frac{1}{n}\sum_{i=1}^{n} \mathbf{1}\{x_i \le x\}$ is a non-parametric estimator of the population CDF $F(x)$.
Glivenko-Cantelli theorem: $\sup_x \lvert \hat F_n(x) - F(x) \rvert \to 0$ almost surely as $n \to \infty$.
Code cell 14
# === 2.4 Empirical CDF and Glivenko-Cantelli ===
np.random.seed(42)
from scipy.special import ndtr # Gaussian CDF
# ECDF computation
def ecdf(data):
x_sorted = np.sort(data)
y = np.arange(1, len(data)+1) / len(data)
return x_sorted, y
# Glivenko-Cantelli convergence
x_eval = np.linspace(-4, 4, 1000)
F_true = ndtr(x_eval) # true CDF of N(0,1)
print('Glivenko-Cantelli convergence: sup|F̂_n - F|')
for n in [10, 50, 100, 500, 2000, 10000]:
sample = np.random.normal(0, 1, n)
F_hat = np.mean(sample[:, None] <= x_eval, axis=0)
sup_err = np.max(np.abs(F_hat - F_true))
print(f' n={n:6d}: sup error = {sup_err:.5f}')
print('\nPASS - supremum error decreases as n increases (Glivenko-Cantelli)')
# Five-number summary
np.random.seed(42)
x = np.random.exponential(2, 200)
q1, q2, q3 = np.percentile(x, [25, 50, 75])
five_num = np.array([x.min(), q1, q2, q3, x.max()])
print(f'\nFive-number summary (Exp(2) sample, n=200):')
print(f' min={five_num[0]:.3f}, Q1={five_num[1]:.3f}, median={five_num[2]:.3f}, '
f'Q3={five_num[3]:.3f}, max={five_num[4]:.3f}')
print(f'  IQR = {q3-q1:.3f}, theory (scale * ln 3) = {2*np.log(3):.3f}')
Code cell 15
# === 2.5 Distribution Visualisation ===
if HAS_MPL:
np.random.seed(42)
fig, axes = plt.subplots(1, 4, figsize=(16, 4))
dists = [
(np.random.normal(0,1,500), 'Gaussian\ng₁=0, g₂=0', '#0072B2'),
(np.random.exponential(1,500), 'Exponential\ng₁=2, g₂=6', '#D55E00'),
(np.random.uniform(-1,1,500), 'Uniform\ng₁=0, g₂=-1.2', '#009E73'),
(np.random.standard_t(3, 500), 'Student-t(3)\ng₁=0, g₂=∞', '#CC79A7'),
]
for ax, (d, title, col) in zip(axes, dists):
ax.hist(d, bins=30, density=True, alpha=0.6, color=col, edgecolor='white')
ax.axvline(np.mean(d), color='black', linestyle='--', label=f'mean={np.mean(d):.2f}', lw=2)
ax.axvline(np.median(d), color='red', linestyle=':', label=f'median={np.median(d):.2f}', lw=2)
ax.set_title(title); ax.legend(fontsize=9)
ax.set_xlabel('x'); ax.set_ylabel('density')
plt.suptitle('Distribution Shapes: Effect on Mean vs Median', fontsize=13, fontweight='bold')
plt.tight_layout(); plt.show()
print('Note: For Exponential, mean >> median due to right skew.')
else:
print('Matplotlib not available.')
3. Robust Statistics
Breakdown point: fraction of data that can be corrupted before a statistic fails.
| Statistic | Breakdown point |
|---|---|
| Sample mean | 0% — a single arbitrarily bad point ruins it |
| Sample median | 50% |
| IQR | 25% |
| MAD | 50% |
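The 1.4826 factor that appears in the next cell is the Gaussian consistency constant 1/Φ⁻¹(0.75), which rescales the MAD into a robust estimate of σ; a quick check (my own sketch, not part of the original cell):
import numpy as np
from scipy import stats
c = 1.0 / stats.norm.ppf(0.75)            # ≈ 1.4826
rng = np.random.default_rng(1)
x = rng.normal(0.0, 3.0, 100_000)
mad = np.median(np.abs(x - np.median(x)))
print(c, mad * c, np.std(x, ddof=1))      # scaled MAD ≈ sample std ≈ 3 for Gaussian data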
Code cell 17
# === 3.1 Breakdown Point Demonstration ===
np.random.seed(42)
n = 100
x_base = np.random.normal(0, 1, n)
print('Effect of outlier contamination on location estimators:')
print(f'{'Frac corrupt':>14} {'Mean':>10} {'Median':>10} {'10%-Trimmed':>12} {'MAD (robust σ)':>16}')
print('-' * 65)
for frac in [0.0, 0.01, 0.05, 0.10, 0.25, 0.49]:
k = int(frac * n)
x = x_base.copy()
x[:k] = 1000.0 # corrupt k observations with extreme value
trimmed = stats.trim_mean(x, 0.10)
mad = np.median(np.abs(x - np.median(x))) * 1.4826
print(f'{frac:>14.2f} {np.mean(x):>10.3f} {np.median(x):>10.3f} '
f'{trimmed:>12.3f} {mad:>16.3f}')
print('\nLESSON: Mean breaks with even 1% contamination.')
print(' Median and MAD stay near 0 up to 49% contamination.')
Code cell 18
# === 3.2 Outlier Detection Methods Comparison ===
np.random.seed(42)
n_clean = 200
x_clean = np.random.normal(50, 10, n_clean)
outliers = np.array([5, 10, 90, 100, 110]) # 5 outliers
x = np.concatenate([x_clean, outliers])
# Method 1: Z-score (k=3)
z = (x - np.mean(x)) / np.std(x, ddof=1)
flag_z = np.abs(z) > 3
# Method 2: Tukey fences
q1, q3 = np.percentile(x, [25, 75])
iqr = q3 - q1
flag_tukey = (x < q1 - 1.5*iqr) | (x > q3 + 1.5*iqr)
# Method 3: Modified Z-score (MAD)
med = np.median(x)
mad = np.median(np.abs(x - med))
m_z = 0.6745 * (x - med) / mad
flag_mad = np.abs(m_z) > 3.5
# True outliers are the last 5
true_outliers = np.zeros(len(x), dtype=bool)
true_outliers[-5:] = True
def metrics(flag, true):
tp = np.sum(flag & true)
fp = np.sum(flag & ~true)
fn = np.sum(~flag & true)
return tp, fp, fn
print('Outlier detection performance:')
print(f'{'Method':<20} {'True Pos':>10} {'False Pos':>10} {'False Neg':>10}')
for name, flag in [('Z-score (|z|>3)', flag_z), ('Tukey (1.5 IQR)', flag_tukey), ('MAD modified', flag_mad)]:
tp, fp, fn = metrics(flag, true_outliers)
print(f'{name:<20} {tp:>10} {fp:>10} {fn:>10}')
print('\nPASS - All three methods detect the 5 injected outliers')
print('NOTE: Z-score has more false positives due to outlier-inflated mean/std.')
4. Bivariate Statistics — Pearson, Spearman, Kendall
Three correlation measures for three types of association:
- Pearson $r$: linear association — sensitive to outliers and nonlinearity
- Spearman $\rho$: monotone association — computed on ranks, robust to outliers (a rank-based sketch follows this list)
- Kendall $\tau$: concordance probability — small samples, ordinal data
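As a reminder of the definition (the cell below calls scipy directly), Spearman's ρ is simply Pearson's r computed on ranks; a minimal sketch:
import numpy as np
from scipy import stats
rng = np.random.default_rng(2)
x = rng.uniform(0, 10, 200)
y = x**3 + rng.normal(0, 20, 200)               # monotone but nonlinear
rx, ry = stats.rankdata(x), stats.rankdata(y)   # average ranks handle ties
rho_manual = np.corrcoef(rx, ry)[0, 1]
print(rho_manual, stats.spearmanr(x, y).statistic)  # agree to float precision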
Code cell 20
# === 4.1 Pearson vs Spearman vs Kendall ===
np.random.seed(42)
n = 200
# Case 1: Linear relationship
x = np.linspace(0, 10, n)
y_linear = 2*x + np.random.normal(0, 2, n)
# Case 2: Monotone nonlinear (cubic)
y_cubic = x**3 + np.random.normal(0, 20, n)
# Case 3: Non-monotone (sine)
y_sine = np.sin(x) + np.random.normal(0, 0.3, n)
# Case 4: Linear with heavy-tail outlier
y_outlier = y_linear.copy()
y_outlier[np.random.choice(n, 10)] = np.random.normal(200, 10, 10)
print('Correlation measures for different relationship types:')
print(f'{'Case':<25} {'Pearson r':>12} {'Spearman ρ':>12} {'Kendall τ':>12}')
print('-' * 65)
cases = [
('Linear', x, y_linear),
('Cubic (monotone)', x, y_cubic),
('Sine (non-monotone)', x, y_sine),
('Linear + outliers', x, y_outlier),
]
for name, xi, yi in cases:
r = np.corrcoef(xi, yi)[0, 1]
rho = stats.spearmanr(xi, yi).statistic
tau = stats.kendalltau(xi, yi).statistic
print(f'{name:<25} {r:>12.4f} {rho:>12.4f} {tau:>12.4f}')
print('\nPASS - All correlation measures computed')
print('NOTE: Spearman detects the cubic relationship (r=1.0) even though Pearson < 1.')
print('NOTE: Outliers reduce Pearson dramatically but Spearman/Kendall are robust.')
5. Multivariate Statistics — Covariance Matrix and Mahalanobis Distance
Properties: symmetric, positive semi-definite, rank $\le \min(d,\, n-1)$ for the centred sample covariance.
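The rank bound is easy to verify numerically when n − 1 < d; a small sketch with arbitrarily chosen dimensions:
import numpy as np
rng = np.random.default_rng(3)
n, d = 5, 10                           # fewer samples than dimensions
X = rng.normal(size=(n, d))
Xc = X - X.mean(axis=0)                # centring removes one degree of freedom
S = (Xc.T @ Xc) / (n - 1)              # sample covariance, shape (d, d)
print(np.linalg.matrix_rank(S))        # at most min(d, n-1) = 4
print(np.linalg.eigvalsh(S)[:5])       # smallest eigenvalues are ~0 (rank deficiency)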
Code cell 22
# === 5.1 Empirical Covariance Matrix ===
np.random.seed(42)
# True parameters
mu_true = np.array([2.0, -1.0, 0.5])
Sigma_true = np.array([[4.0, 2.0, -1.0],
[2.0, 9.0, 1.5],
[-1.0, 1.5, 2.0]])
# Generate data
n = 5000
L = np.linalg.cholesky(Sigma_true)
X = np.random.normal(0, 1, (n, 3)) @ L.T + mu_true
# Compute sample statistics
mu_hat = X.mean(axis=0)
Xc = X - mu_hat
Sigma_hat = (Xc.T @ Xc) / (n - 1)
print('True mean: ', mu_true)
print('Sample mean: ', mu_hat)
print()
print('True covariance matrix:')
print(Sigma_true)
print('\nSample covariance matrix:')
print(Sigma_hat)
# Verify PSD
eigvals = np.linalg.eigvalsh(Sigma_hat)
print(f'\nEigenvalues of Σ̂: {eigvals}')
assert np.all(eigvals > -1e-10), 'Sigma_hat is not PSD!'
print('PASS - Sample covariance matrix is positive semi-definite')
# Check closeness
assert np.allclose(mu_hat, mu_true, atol=0.2)
assert np.allclose(Sigma_hat, Sigma_true, atol=0.5)
print('PASS - Sample statistics converge to true parameters')
Code cell 23
# === 5.2 Mahalanobis Distance and Outlier Detection ===
np.random.seed(42)
d = 2
mu = np.array([0.0, 0.0])
Sigma = np.array([[4.0, 3.0], [3.0, 9.0]])
# Generate clean data + outliers
L = np.linalg.cholesky(Sigma)
X_clean = np.random.normal(0, 1, (200, 2)) @ L.T + mu
X_outliers = np.array([[5, 8], [-4, 9], [6, -2], [-5, -8], [7, 3]])
X_all = np.vstack([X_clean, X_outliers])
# Compute Mahalanobis distance
mu_hat = X_all.mean(axis=0)
Xc = X_all - mu_hat
Sigma_hat = (Xc.T @ Xc) / (len(X_all) - 1)
Sigma_inv = np.linalg.inv(Sigma_hat)
def mahal(X, mu, Sigma_inv):
diff = X - mu
return np.sqrt(np.einsum('ij,jk,ik->i', diff, Sigma_inv, diff))
d_M = mahal(X_all, mu_hat, Sigma_inv)
# Under bivariate Gaussian: d_M^2 ~ chi^2(2)
chi2_threshold = stats.chi2.ppf(0.975, df=2)
flagged = d_M**2 > chi2_threshold
print(f'Chi^2(2) 97.5th percentile threshold: {chi2_threshold:.3f}')
print(f'Mahalanobis threshold (sqrt): {np.sqrt(chi2_threshold):.3f}')
print(f'\nFlagged as outliers: {flagged.sum()} points')
print(f' Clean points flagged (false positives): {flagged[:200].sum()} / 200')
print(f' Injected outliers flagged: {flagged[200:].sum()} / 5')
assert flagged[200:].sum() >= 3, 'Should detect at least 3 of 5 injected outliers'
print('PASS - Mahalanobis distance detects multivariate outliers')
Code cell 24
# === 5.3 Mahalanobis vs Euclidean — Visualisation ===
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Plot 1: scatter with Mahalanobis ellipse
ax = axes[0]
clean_mask = ~flagged
ax.scatter(X_all[clean_mask, 0], X_all[clean_mask, 1],
c='#0072B2', alpha=0.5, s=20, label='Normal')
ax.scatter(X_all[flagged, 0], X_all[flagged, 1],
c='#D55E00', s=80, marker='x', lw=2, label='Flagged (Mahal)')
# Draw 95% Mahalanobis ellipse
theta = np.linspace(0, 2*np.pi, 200)
circle = np.column_stack([np.cos(theta), np.sin(theta)])
eigvals_s, eigvecs_s = np.linalg.eigh(Sigma_hat)
radius = np.sqrt(chi2_threshold)
ellipse = circle @ np.diag(np.sqrt(eigvals_s)) @ eigvecs_s.T * radius + mu_hat
ax.plot(ellipse[:, 0], ellipse[:, 1], 'k--', lw=2, label='97.5% Mahal ellipse')
ax.set_title('Mahalanobis Distance Outlier Detection')
ax.set_xlabel('Feature 1'); ax.set_ylabel('Feature 2')
ax.legend()
# Plot 2: d_M^2 vs chi^2(2) CDF
ax = axes[1]
d_M_sq_sorted = np.sort(d_M[:200]**2)
n_clean = 200
ecdf_vals = np.arange(1, n_clean+1) / n_clean
chi2_vals = np.linspace(0, 15, 300)
ax.step(d_M_sq_sorted, ecdf_vals, label='Empirical CDF of d_M²', color='#0072B2')
ax.plot(chi2_vals, stats.chi2.cdf(chi2_vals, df=2), 'r--',
label='χ²(2) CDF', lw=2)
ax.axvline(chi2_threshold, color='black', linestyle=':', label='97.5% threshold')
ax.set_xlabel('d_M²'); ax.set_ylabel('CDF')
ax.set_title('d_M² vs χ²(2) Distribution')
ax.legend()
plt.tight_layout(); plt.show()
print('Under bivariate Gaussianity, d_M² ~ χ²(2) — the ECDF tracks the theoretical CDF.')
else:
print('Matplotlib not available.')
6. Data Standardisation and Preprocessing
Three scaling methods for different data characteristics:
| Method | Formula | Best for |
|---|---|---|
| Z-score | $(x - \bar{x})/s$ | Symmetric data, no extreme outliers |
| Min-Max | $(x - x_{\min})/(x_{\max} - x_{\min})$ | Bounded range needed |
| Robust | $(x - \mathrm{median})/\mathrm{IQR}$ | Outliers present |
Critical rule: Always fit the scaler on training data only — never on test data.
Code cell 26
# === 6.1 Scaling Methods Comparison ===
np.random.seed(42)
# Dataset with outliers
x = np.concatenate([np.random.normal(50, 10, 95), [200, 210, -50, -60, 300]])
def z_score(x): return (x - np.mean(x)) / np.std(x, ddof=1)
def min_max(x): return (x - x.min()) / (x.max() - x.min())
def robust_scale(x):
q1, q3 = np.percentile(x, [25, 75])
return (x - np.median(x)) / (q3 - q1)
print('Scaling comparison (data with 5 outliers out of 100):')
print(f'{'Method':<15} {'Mean':>8} {'Std':>8} {'Min':>8} {'Max':>8} {'|Outlier|':>10}')
print('-' * 55)
for name, scaled in [('Z-score', z_score(x)), ('Min-Max', min_max(x)), ('Robust', robust_scale(x))]:
print(f'{name:<15} {np.mean(scaled):>8.3f} {np.std(scaled,ddof=1):>8.3f} '
f'{scaled.min():>8.3f} {scaled.max():>8.3f} '
f'{np.max(np.abs(scaled[-5:])):>10.3f}')
print('\nNOTE: Z-score — the outliers inflate the mean/std, so the scaled outliers look less extreme while the clean bulk is compressed.')
print('NOTE: Robust scaling — median/IQR ignore the outliers, so the clean bulk keeps its scale and the outliers stay clearly extreme (easy to flag).')
# Train/test split leakage demo
np.random.seed(42)
train = np.random.normal(50, 10, 100)
test = np.random.normal(55, 10, 50) # slightly shifted test distribution
# CORRECT: scale test using training statistics
mu_tr, s_tr = np.mean(train), np.std(train, ddof=1)
test_correct = (test - mu_tr) / s_tr
# WRONG: scale test using test statistics (data leakage)
test_leaked = z_score(test)
print(f'\nTest set mean after correct scaling: {np.mean(test_correct):.3f} (should be ≈+0.5, reflecting shift)')
print(f'Test set mean after leaky scaling: {np.mean(test_leaked):.3f} (≈0 — shift is hidden!)')
print('PASS - Correct scaling preserves distributional shift; leaky scaling hides it.')
6.2 Normalisation in Neural Networks
Batch Norm, Layer Norm, and RMS Norm are all sample statistics operations.
The key difference is which dimension the statistics are computed over:
X shape: (batch, sequence, features)
Batch Norm: stats over (batch) per feature → breaks for batch_size=1
Layer Norm: stats over (features) per token → works for any batch size
RMS Norm: RMS over (features) per token → no mean subtraction
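The cell below works with a 2-D (batch, features) array; here is a minimal sketch of the same statistics on the 3-D (batch, sequence, features) layout shown above (shapes chosen arbitrarily for illustration):
import numpy as np
rng = np.random.default_rng(4)
B, T, D = 2, 3, 8                              # batch, sequence, features
X = rng.normal(1.0, 2.0, (B, T, D))
eps = 1e-5
# Layer Norm: statistics over the feature axis, independently per (sample, token)
mu_ln  = X.mean(axis=-1, keepdims=True)        # shape (B, T, 1)
var_ln = X.var(axis=-1, keepdims=True)
X_ln = (X - mu_ln) / np.sqrt(var_ln + eps)
# Batch Norm: statistics over batch and sequence positions, per feature
mu_bn  = X.mean(axis=(0, 1), keepdims=True)    # shape (1, 1, D)
var_bn = X.var(axis=(0, 1), keepdims=True)
X_bn = (X - mu_bn) / np.sqrt(var_bn + eps)
print(np.allclose(X_ln.mean(axis=-1), 0, atol=1e-6))      # True — each token centred
print(np.allclose(X_bn.mean(axis=(0, 1)), 0, atol=1e-6))  # True — each feature centred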
Code cell 28
# === 6.2 Batch Norm vs Layer Norm vs RMS Norm ===
np.random.seed(42)
batch_size, d = 32, 512
X = np.random.normal(2.0, 3.0, (batch_size, d)) # non-zero mean, non-unit variance
eps = 1e-5
# Batch Norm: normalise each feature (column) over the batch
mu_BN = X.mean(axis=0) # shape (d,)
var_BN = X.var(axis=0) # biased (ddof=0), shape (d,)
X_BN = (X - mu_BN) / np.sqrt(var_BN + eps)
# Layer Norm: normalise each sample (row) over features
mu_LN = X.mean(axis=1, keepdims=True) # shape (batch, 1)
var_LN = X.var(axis=1, keepdims=True) # shape (batch, 1)
X_LN = (X - mu_LN) / np.sqrt(var_LN + eps)
# RMS Norm: divide by root-mean-square (no mean subtraction)
rms = np.sqrt((X**2).mean(axis=1, keepdims=True))
X_RMS = X / (rms + eps)
print('Normalisation layer verification:')
print(f'{'Method':<12} {'Feature mean (col)':>20} {'Feature std (col)':>20} {'Sample RMS (row)':>18}')
print('-' * 74)
print(f'{'BatchNorm':<12} {np.abs(X_BN.mean(axis=0)).mean():>20.6f} '
f'{X_BN.std(axis=0, ddof=1).mean():>20.6f} '
f'{np.sqrt((X_BN**2).mean(axis=1)).mean():>18.6f}')
print(f'{'LayerNorm':<12} {np.abs(X_LN.mean(axis=1)).mean():>20.6f} '
f'{X_LN.std(axis=1, ddof=1).mean():>20.6f} '
f'{np.sqrt((X_LN**2).mean(axis=1)).mean():>18.6f}')
print(f'{'RMSNorm':<12} {np.abs(X_RMS.mean(axis=1)).mean():>20.6f} '
f'{X_RMS.std(axis=1, ddof=1).mean():>20.6f} '
f'{np.sqrt((X_RMS**2).mean(axis=1)).mean():>18.6f}')
# Verify
assert np.allclose(X_BN.mean(axis=0), 0, atol=1e-6), 'BatchNorm: column means not 0'
assert np.allclose(X_BN.var(axis=0), 1, atol=1e-3), 'BatchNorm: column vars not 1'
assert np.allclose(X_LN.mean(axis=1), 0, atol=1e-6), 'LayerNorm: row means not 0'
assert np.allclose(X_LN.var(axis=1), 1, atol=1e-3), 'LayerNorm: row vars not 1'
assert np.allclose(np.sqrt((X_RMS**2).mean(axis=1)), 1, atol=1e-3), 'RMSNorm: RMS not 1'
print('\nPASS - All three normalisation methods verified')
# Batch Norm breaks for batch_size=1
X_single = X[[0]] # shape (1, d)
var_single = X_single.var(axis=0) # all zeros!
X_BN_single = (X_single - X_single.mean(axis=0)) / np.sqrt(var_single + eps)
print(f'\nBatchNorm batch_size=1: all outputs ≈ 0: {np.allclose(X_BN_single, 0, atol=0.1)}')
print('LESSON: LayerNorm is used in transformers precisely because it works for batch_size=1.')
7. Visualisation for EDA
The complete EDA visualisation toolkit: histograms, box plots, violin plots, ECDFs, Q-Q plots, scatter plots, pair plots, and correlation heatmaps.
Code cell 30
# === 7.1 Univariate Plots ===
if HAS_MPL:
np.random.seed(42)
x = np.concatenate([np.random.normal(0, 1, 150), np.random.normal(4, 0.8, 50)])
fig, axes = plt.subplots(1, 4, figsize=(18, 4))
# Histogram with KDE
ax = axes[0]
ax.hist(x, bins=30, density=True, color='#0072B2', alpha=0.6, edgecolor='white')
x_eval = np.linspace(x.min()-1, x.max()+1, 300)
kde = stats.gaussian_kde(x, bw_method='silverman')
ax.plot(x_eval, kde(x_eval), 'r-', lw=2, label='KDE')
ax.axvline(np.mean(x), color='black', ls='--', label=f'mean={np.mean(x):.2f}')
ax.axvline(np.median(x), color='orange', ls=':', lw=2, label=f'median={np.median(x):.2f}')
ax.set_title('Histogram + KDE'); ax.set_xlabel('x'); ax.legend(fontsize=9)
# Box plot
ax = axes[1]
ax.boxplot(x, vert=True, patch_artist=True,
boxprops=dict(facecolor='#56B4E9', alpha=0.7))
ax.set_title('Box Plot (Tukey fences)'); ax.set_ylabel('x')
# ECDF
ax = axes[2]
x_sorted = np.sort(x)
y_ecdf = np.arange(1, len(x)+1) / len(x)
ax.step(x_sorted, y_ecdf, color='#009E73', lw=2, label='ECDF')
ax.set_title('Empirical CDF'); ax.set_xlabel('x'); ax.set_ylabel('F(x)')
# Q-Q plot vs Gaussian
ax = axes[3]
(osm, osr), _ = stats.probplot(x, dist='norm')
ax.scatter(osm, osr, color='#D55E00', s=15, alpha=0.7)
xlim = np.array([min(osm), max(osm)])
ax.plot(xlim, xlim, 'k--', lw=1.5, label='y=x (Gaussian)')
ax.set_title('Q-Q Plot vs Gaussian'); ax.set_xlabel('Theoretical quantiles')
ax.set_ylabel('Sample quantiles'); ax.legend()
plt.suptitle('Univariate EDA Plots — Bimodal Distribution', fontsize=13, fontweight='bold')
plt.tight_layout(); plt.show()
print('Q-Q plot shows S-shaped deviation from Gaussian — confirms bimodality.')
else:
print('Matplotlib not available.')
Code cell 31
# === 7.2 Correlation Heatmap ===
if HAS_MPL:
np.random.seed(42)
n, d = 300, 6
feature_names = ['income', 'credit_score', 'age', 'debt_ratio', 'late_pmts', 'num_accounts']
# Construct data with known correlation structure
z = np.random.normal(0, 1, (n, d))
# Induce correlations
z[:, 1] = 0.7*z[:, 0] + 0.7*np.random.normal(0,1,n) # credit ~ income
z[:, 3] = -0.5*z[:, 0] + 0.8*np.random.normal(0,1,n) # debt_ratio ~ -income
z[:, 4] = 0.6*z[:, 3] + 0.8*np.random.normal(0,1,n) # late_pmts ~ debt_ratio
R = np.corrcoef(z.T)
fig, ax = plt.subplots(figsize=(7, 6))
im = ax.imshow(R, vmin=-1, vmax=1, cmap='coolwarm', aspect='auto')
plt.colorbar(im, ax=ax, label='Pearson r')
ax.set_xticks(range(d)); ax.set_yticks(range(d))
ax.set_xticklabels(feature_names, rotation=45, ha='right')
ax.set_yticklabels(feature_names)
for i in range(d):
for j in range(d):
ax.text(j, i, f'{R[i,j]:.2f}', ha='center', va='center',
fontsize=9, color='black' if abs(R[i,j]) < 0.6 else 'white')
ax.set_title('Sample Correlation Matrix — Financial Features', fontweight='bold')
plt.tight_layout(); plt.show()
print(f'income ↔ credit_score correlation: {R[0,1]:.3f}')
print(f'debt_ratio ↔ late_payments correlation: {R[3,4]:.3f}')
else:
print('Matplotlib not available.')
8. ML Applications
8.1 Adam Optimiser as Online Sample Statistics
Adam's first and second moment estimates are exponentially-weighted moving averages (EWMA) of the gradient and squared gradient — with bias correction analogous to Bessel's correction.
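Why dividing by 1 − β₁ᵗ removes the initialisation bias (standard argument, stated here for reference): with m₀ = 0 and gradients of constant mean,
$$m_t=(1-\beta_1)\sum_{k=1}^{t}\beta_1^{\,t-k}g_k \;\Longrightarrow\; \mathbb{E}[m_t]=\mathbb{E}[g]\,(1-\beta_1^{t}), \qquad \hat m_t=\frac{m_t}{1-\beta_1^{t}} \;\Rightarrow\; \mathbb{E}[\hat m_t]=\mathbb{E}[g].$$
The same argument with β₂ and g² gives the v̂ₜ correction.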
Code cell 33
# === 8.1 Adam Momentum as Running Statistics ===
np.random.seed(42)
T = 500
mu_g, sigma_g = 0.1, 1.0 # true gradient distribution
gradients = np.random.normal(mu_g, sigma_g, T)
beta1, beta2 = 0.9, 0.999
m, v = 0.0, 0.0
m_hat_hist = []
v_hat_hist = []
running_mean = []
running_var = []
cum_mean = 0.0
for t, g in enumerate(gradients, 1):
# Adam updates
m = beta1 * m + (1 - beta1) * g
v = beta2 * v + (1 - beta2) * g**2
m_hat = m / (1 - beta1**t)
v_hat = v / (1 - beta2**t)
m_hat_hist.append(m_hat)
v_hat_hist.append(v_hat)
# True running mean
cum_mean = cum_mean + (g - cum_mean) / t # Welford's
running_mean.append(cum_mean)
m_hat_hist = np.array(m_hat_hist)
v_hat_hist = np.array(v_hat_hist)
running_mean = np.array(running_mean)
print(f'True gradient mean: {mu_g:.4f}')
print(f'True E[g²]: {sigma_g**2 + mu_g**2:.4f}')
print(f'\nAdam m̂ at t=500: {m_hat_hist[-1]:.4f}')
print(f'True running mean: {running_mean[-1]:.4f}')
print(f'Adam v̂ at t=500: {v_hat_hist[-1]:.4f}')
print(f'Empirical mean of g²: {np.mean(gradients**2):.4f}')
# Verify bias correction matters at early steps
print(f'\nAt t=1:')
print(f' Uncorrected m₁ = (1-β₁)g₁ = {(1-beta1)*gradients[0]:.4f}')
print(f' Bias-corrected m̂₁ = g₁ = {m_hat_hist[0]:.4f}')
print(f' True g₁ = {gradients[0]:.4f}')
assert np.isclose(m_hat_hist[0], gradients[0], atol=1e-10)
print('PASS - Bias correction at t=1 recovers the first gradient exactly')
Code cell 34
# === 8.2 Adam Convergence Visualisation ===
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
steps = np.arange(1, T+1)
# Left: m̂_t vs running mean
axes[0].plot(steps, m_hat_hist, color='#0072B2', lw=1.5, label='Adam m̂ₜ (bias-corrected)')
axes[0].plot(steps, running_mean, color='#D55E00', lw=1.5, ls='--', label='True running mean')
axes[0].axhline(mu_g, color='black', ls=':', lw=1.5, label=f'True μ = {mu_g}')
axes[0].set_xlabel('Step t'); axes[0].set_ylabel('First moment')
axes[0].set_title('Adam m̂ₜ vs True Running Mean'); axes[0].legend()
# Right: compare β₁ values (effective window size)
beta1_vals = [0.5, 0.9, 0.99]
colors = ['#0072B2', '#D55E00', '#009E73']
for b1, col in zip(beta1_vals, colors):
m_b = 0.0
mh = []
for t, g in enumerate(gradients, 1):
m_b = b1*m_b + (1-b1)*g
mh.append(m_b / (1 - b1**t))
win = int(1/(1-b1))
axes[1].plot(steps, mh, color=col, lw=1.5, label=f'β₁={b1} (window≈{win})')
axes[1].axhline(mu_g, color='black', ls=':', lw=2, label='True μ')
axes[1].set_xlabel('Step t'); axes[1].set_ylabel('m̂ₜ')
axes[1].set_title('Effect of β₁ on Convergence Speed'); axes[1].legend()
plt.suptitle('Adam: Exponentially-Weighted Sample Statistics', fontsize=13, fontweight='bold')
plt.tight_layout(); plt.show()
print('Larger β₁ = larger effective window = slower convergence to true mean.')
else:
print('Matplotlib not available.')
Code cell 35
# === 8.3 Data Drift Detection via Descriptive Statistics ===
np.random.seed(42)
# Simulate 12 months of production data — drift introduced at month 6
months = 12
n_per_month = 500
feature_data = []
for m in range(months):
if m < 6:
x_m = np.random.normal(50, 10, n_per_month) # stable
else:
x_m = np.random.normal(50 + (m-5)*3, 10, n_per_month) # gradual drift
feature_data.append(x_m)
# Compute descriptive statistics per month
mu_train = np.mean(feature_data[0])
s_train = np.std(feature_data[0], ddof=1)
print('Monthly descriptive statistics monitoring:')
print(f'{'Month':>6} {'Mean':>8} {'Std':>8} {'Skew':>8} {'|Mean shift|/SE':>16} {'Drift flag':>12}')
print('-' * 62)
se = s_train / np.sqrt(n_per_month)
for m_idx, x_m in enumerate(feature_data):
mu_m = np.mean(x_m)
s_m = np.std(x_m, ddof=1)
skew_m = stats.skew(x_m)
z_shift = abs(mu_m - mu_train) / se
flag = '*** DRIFT ***' if z_shift > 3 else ''
print(f'{m_idx+1:>6} {mu_m:>8.2f} {s_m:>8.2f} {skew_m:>8.3f} {z_shift:>16.2f} {flag:>12}')
print('\nPASS - Drift detected in months 7-12 via mean shift > 3 standard errors')
print('LESSON: Monitor descriptive statistics continuously in production ML systems.')
Summary
This notebook covered the complete descriptive statistics toolkit:
- Anscombe's Quartet — why plotting always comes before statistics
- Central tendency — mean minimises L2 loss; median minimises L1 loss
- Bessel's correction — ddof=1 for unbiased sample variance
- Skewness and kurtosis — shape of the distribution; relevant for gradient noise
- ECDF and Glivenko-Cantelli — empirical CDF converges uniformly to population CDF
- Robust statistics — MAD and median have 50% breakdown point vs 0% for mean/variance
- Pearson/Spearman/Kendall — linear, monotone, and concordance correlation
- Covariance matrix — PSD, Bessel correction, rank ≤ min(d, n-1)
- Mahalanobis distance — accounts for scale and correlation; d_M² ~ χ²(d) under multivariate Gaussianity
- BatchNorm / LayerNorm / RMSNorm — normalisation layers as sample statistics
- Adam — bias-corrected EWMA of gradient and gradient² as running statistics
- Drift detection — monitoring mean shift in production data
Next: §02 Estimation Theory — How good are sample statistics as estimators? Bias, variance, MLE, Fisher information.
3.3 Robust Estimators — Trimmed Mean and Winsorisation
Trimmed mean: remove the bottom and top α-fraction of observations before averaging.
Winsorised mean: clip extreme values to the α and 1−α quantiles.
Both are L-estimators — linear combinations of order statistics.
For AI: Gradient clipping = Winsorisation of gradient norms.
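A trimmed mean is literally a weighted sum of order statistics; a short sketch comparing the explicit weight form with scipy.stats.trim_mean (the 10% level and the t(3) sample are arbitrary choices):
import numpy as np
from scipy import stats
rng = np.random.default_rng(5)
x = rng.standard_t(3, size=100)
alpha = 0.10
x_sorted = np.sort(x)                       # order statistics x_(1) <= ... <= x_(n)
k = int(alpha * len(x))                     # observations trimmed from each tail
w = np.zeros(len(x))
w[k:len(x) - k] = 1.0 / (len(x) - 2 * k)    # equal weights on the middle order statistics
print(np.sum(w * x_sorted), stats.trim_mean(x, alpha))  # should agree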
Code cell 38
# === 3.3 Trimmed Mean, Winsorised Mean, and Gradient Clipping ===
np.random.seed(42)
n = 1000
sigma_g = 2.0
# Simulate heavy-tailed gradient norms
gradient_norms = np.abs(np.random.standard_t(3, n)) * sigma_g
alpha = 0.05 # 5% trimming
clip_threshold = np.percentile(gradient_norms, 95) # 95th percentile as clipping threshold
# Trimmed mean
trimmed = stats.trim_mean(gradient_norms, alpha)
# Winsorised mean (clip to [Q_alpha, Q_{1-alpha}])
q_lo = np.percentile(gradient_norms, alpha*100)
q_hi = np.percentile(gradient_norms, (1-alpha)*100)
winsorised = np.clip(gradient_norms, q_lo, q_hi).mean()
# Gradient clipping (L2 norm clipping)
clipped = np.where(gradient_norms > clip_threshold, clip_threshold, gradient_norms)
clipped_mean = clipped.mean()
print('Gradient norm statistics (heavy-tailed, t(3) distribution):')
print(f' Raw mean: {np.mean(gradient_norms):.4f}')
print(f' Raw median: {np.median(gradient_norms):.4f}')
print(f' 5%-Trimmed mean: {trimmed:.4f}')
print(f' 5%-Winsorised mean:{winsorised:.4f}')
print(f' After clipping at P95={clip_threshold:.3f}: {clipped_mean:.4f}')
print(f' Fraction clipped: {np.mean(gradient_norms > clip_threshold)*100:.1f}%')
# Show that clipping reduces variance
print(f'\n Raw variance: {np.var(gradient_norms, ddof=1):.4f}')
print(f' Clipped variance: {np.var(clipped, ddof=1):.4f}')
print(f' Variance reduction: {(1 - np.var(clipped)/np.var(gradient_norms))*100:.1f}%')
print('PASS - Clipping substantially reduces gradient variance')
print('LESSON: Gradient clipping in LLM training = Winsorisation of gradient norms.')
4.2 Bivariate Visualisation — Scatter Plots and Correlation
Visualising correlation for different relationship types.
Code cell 40
# === 4.2 Bivariate Visualisation ===
if HAS_MPL:
np.random.seed(42)
n = 150
x = np.random.uniform(0, 10, n)
# Four relationship types
cases = [
('Strong linear\nr=0.95', 3*x + np.random.normal(0,2,n), '#0072B2'),
('Weak linear\nr≈0.3', 0.5*x + np.random.normal(0,4,n), '#D55E00'),
('Quadratic\n(r=0)', (x-5)**2 + np.random.normal(0,2,n), '#009E73'),
('No relationship\nr≈0', np.random.normal(5,3,n), '#CC79A7'),
]
fig, axes = plt.subplots(1, 4, figsize=(16, 4))
for ax, (title, y, col) in zip(axes, cases):
ax.scatter(x, y, color=col, alpha=0.5, s=20)
r = np.corrcoef(x, y)[0,1]
rho = stats.spearmanr(x, y).statistic
# Fit and plot regression line
m, b = np.polyfit(x, y, 1)
xp = np.linspace(0, 10, 100)
ax.plot(xp, m*xp+b, 'k--', lw=1.5)
ax.set_title(f'{title}\nPearson r={r:.2f}, Spearman ρ={rho:.2f}', fontsize=10)
ax.set_xlabel('x'); ax.set_ylabel('y')
plt.suptitle('Bivariate Relationships: Pearson vs Spearman', fontsize=13, fontweight='bold')
plt.tight_layout(); plt.show()
print('Quadratic: Pearson≈0 and Spearman≈0 — neither captures a non-monotone relationship; only the plot reveals it.')
else:
print('Matplotlib not available.')
5.4 Curse of Dimensionality — Concentration of Pairwise Distances
As dimensionality grows, all pairwise distances concentrate around the same value. This makes nearest-neighbour methods and attention unreliable in high dimensions.
The 1/√d_k scaling in attention is a direct fix: dividing by the standard deviation of the dot products (≈ √d_k for unit-variance components) restores unit variance.
Code cell 42
# === 5.4 Concentration of Distances in High Dimensions ===
np.random.seed(42)
n_pts = 200
dims = [2, 5, 10, 50, 100, 500, 1000]
print('Concentration of pairwise Euclidean distances:')
print(f'{'dim d':>8} {'Mean dist':>12} {'Std dist':>12} {'CV = std/mean':>15} {'Max-Min':>10}')
print('-' * 60)
for d in dims:
# Unit Gaussian in d dimensions
X = np.random.normal(0, 1, (n_pts, d)) / np.sqrt(d) # normalise so each dim has var=1/d
# Pairwise distances
diffs = X[:, None, :] - X[None, :, :] # n x n x d
dists = np.sqrt((diffs**2).sum(axis=-1))
# Upper triangle only
idx = np.triu_indices(n_pts, k=1)
d_vec = dists[idx]
print(f'{d:>8} {np.mean(d_vec):>12.4f} {np.std(d_vec):>12.4f} '
f'{np.std(d_vec)/np.mean(d_vec):>15.4f} {d_vec.max()-d_vec.min():>10.4f}')
print('\nConcentration: std/mean → 0 as d → ∞. All distances become equal!')
print('This is why attention uses 1/√d_k scaling to prevent softmax saturation.')
# Dot product variance demo
print('\nDot product variance (unit Gaussian vectors):')
print(f'{'d_k':>6} {'E[q·k]':>10} {'Var[q·k]':>12} {'Std':>8}')
for dk in [64, 128, 256, 512, 1024]:
Q = np.random.normal(0, 1, (1000, dk))
K = np.random.normal(0, 1, (1000, dk))
dots = (Q * K).sum(axis=1)
print(f'{dk:>6} {np.mean(dots):>10.2f} {np.var(dots):>12.2f} '
f'{np.std(dots):>8.2f} (theory: {np.sqrt(dk):.2f})')
print('\nPASS - Std of dot products scales as √d_k. Scaling by 1/√d_k restores unit std.')
8.4 Dataset Characterisation for Model Cards
Computing descriptive statistics for responsible AI documentation. Required by Hugging Face model cards, Google's Model Cards, and the EU AI Act.
Code cell 44
# === 8.4 Dataset Statistics for Model Cards ===
np.random.seed(42)
n = 1000
# Synthetic tabular dataset: loan applications
age = np.random.normal(42, 12, n).clip(18, 80).astype(int)
income = np.random.lognormal(4.0, 0.6, n) # log-normal, right-skewed
credit_score= np.random.normal(680, 80, n).clip(300, 850).astype(int)
debt_ratio = np.random.beta(2, 5, n) # bounded [0,1], right-skewed
default = (0.2 + 0.3*(debt_ratio > 0.4) - 0.1*(credit_score > 700) > np.random.rand(n)).astype(int)
features = {'age': age, 'income': income, 'credit_score': credit_score,
'debt_ratio': debt_ratio}
print('=' * 65)
print('DATASET CARD — Loan Default Prediction Dataset')
print(f'n = {n} samples, d = {len(features)} features + 1 target')
print('=' * 65)
print(f'\n{'Feature':<14} {'Mean':>8} {'Std':>8} {'Min':>8} {'Q50':>8} {'Max':>8} {'Skew':>7}')
print('-' * 65)
for name, arr in features.items():
print(f'{name:<14} {np.mean(arr):>8.2f} {np.std(arr,ddof=1):>8.2f} '
f'{np.min(arr):>8.2f} {np.median(arr):>8.2f} '
f'{np.max(arr):>8.2f} {stats.skew(arr):>7.3f}')
# Label distribution
n_pos = default.sum()
print(f'\nTarget: default=0: {n-n_pos} ({(n-n_pos)/n*100:.1f}%), '
f'default=1: {n_pos} ({n_pos/n*100:.1f}%)')
print(f'Class imbalance ratio: {(n-n_pos)/n_pos:.1f}:1')
# Correlation with target
print(f'\nPearson correlation with default:')
for name, arr in features.items():
r = np.corrcoef(arr, default)[0,1]
print(f' {name:<14}: r = {r:+.4f}')
print('\nPASS - Dataset card statistics computed')
print('NOTE: Income is right-skewed (log-normal) → log-transform recommended.')
print('NOTE: Class imbalance → use balanced accuracy or F1, not raw accuracy.')
7.3 Assessing Normality — Q-Q Plots
Q-Q (Quantile-Quantile) plots compare sample quantiles to theoretical (Gaussian) quantiles. Deviations from the diagonal reveal departures from normality:
- Ends flaring away from the diagonal: heavier tails (leptokurtic) — common for gradient distributions
- Ends bending in toward the diagonal: lighter tails (platykurtic)
- A one-sided bow (convex or concave) curve: skewness
Code cell 46
# === 7.3 Q-Q Plots for Normality Assessment ===
if HAS_MPL:
np.random.seed(42)
n = 300
dists = [
('Gaussian\n(expect diagonal)', np.random.normal(0,1,n)),
('Leptokurtic\nStudent-t(3)', np.random.standard_t(3, n)),
('Right-skewed\nExp(1)', np.random.exponential(1, n)),
('Platykurtic\nUniform(-1,1)', np.random.uniform(-1,1,n)),
]
fig, axes = plt.subplots(1, 4, figsize=(16, 4))
for ax, (name, d) in zip(axes, dists):
(osm, osr), _ = stats.probplot(d, dist='norm')
ax.scatter(osm, osr, s=10, alpha=0.6, color='#0072B2')
lim = max(abs(min(osm)), abs(max(osm))) * 1.1
ax.plot([-lim, lim], [-lim, lim], 'r--', lw=1.5)
ax.set_title(name, fontsize=10)
ax.set_xlabel('Theoretical quantiles')
ax.set_ylabel('Sample quantiles')
ax.set_xlim(-lim, lim)
# Add kurtosis annotation
ax.text(0.05, 0.92, f'g₂={stats.kurtosis(d):.2f}',
transform=ax.transAxes, fontsize=9)
plt.suptitle('Q-Q Plots: Diagnosing Departures from Normality', fontsize=13, fontweight='bold')
plt.tight_layout(); plt.show()
print('Heavy tails (t-distribution): bow-shaped Q-Q plot, g₂ > 0')
print('Gradient distributions in LLMs often look like the t(3) Q-Q plot.')
else:
print('Matplotlib not available.')
Appendix: Welford's Online Algorithm
Numerically stable one-pass computation of mean and variance. Used in Batch Norm running statistics and streaming data monitoring.
Code cell 48
# === Welford's Online Algorithm ===
def welford_update(n, mean, M2, x):
"""Online update of mean and variance using Welford's algorithm."""
n += 1
delta = x - mean
mean += delta / n
delta2 = x - mean
M2 += delta * delta2
return n, mean, M2
# Process data one-at-a-time
np.random.seed(42)
data = np.random.normal(10, 3, 1000)
n_w, mean_w, M2_w = 0, 0.0, 0.0
for x in data:
n_w, mean_w, M2_w = welford_update(n_w, mean_w, M2_w, x)
var_w = M2_w / (n_w - 1) # unbiased
print('Welford vs batch computation:')
print(f' Welford mean: {mean_w:.8f}')
print(f' Batch mean: {np.mean(data):.8f}')
print(f' Welford var: {var_w:.8f}')
print(f' Batch var: {np.var(data, ddof=1):.8f}')
assert np.isclose(mean_w, np.mean(data), atol=1e-10)
assert np.isclose(var_w, np.var(data, ddof=1), atol=1e-10)
print('\nPASS - Welford and batch computation give identical results')
# Numerically stable on near-constant data (where naïve fails)
x_const = 1e8 + np.random.normal(0, 1, 100) # constant + small perturbation
# Naïve formula (catastrophic cancellation)
n_naive = len(x_const)
sum1 = np.sum(x_const)
sum2 = np.sum(x_const**2)
var_naive = (sum2 - sum1**2/n_naive) / (n_naive - 1)
# Welford
n_w, mean_w, M2_w = 0, 0.0, 0.0
for x in x_const:
n_w, mean_w, M2_w = welford_update(n_w, mean_w, M2_w, x)
var_welford = M2_w / (n_w - 1)
var_true = np.var(x_const - 1e8, ddof=1) # ground truth
print(f'\nNear-constant data (values ≈ 10^8, noise σ=1):')
print(f' True variance: {var_true:.6f}')
print(f' Naïve formula: {var_naive:.6f} (may be inaccurate due to cancellation)')
print(f' Welford: {var_welford:.6f}')
print('Welford is numerically stable; naïve formula suffers catastrophic cancellation.')
6.4 Missing Data — Imputation Strategies
Different imputation methods have different effects on the descriptive statistics of the filled dataset.
Code cell 50
# === 6.4 Missing Data Imputation ===
np.random.seed(42)
n = 500
x_true = np.random.normal(50, 15, n)
# Introduce MCAR missingness (20% missing)
missing_mask = np.random.rand(n) < 0.20
x_observed = x_true.copy().astype(float)
x_observed[missing_mask] = np.nan
# Three imputation strategies
x_mean_imp = np.where(np.isnan(x_observed), np.nanmean(x_observed), x_observed)
x_median_imp = np.where(np.isnan(x_observed), np.nanmedian(x_observed), x_observed)
# Random imputation (draw from observed distribution)
obs_vals = x_observed[~np.isnan(x_observed)]
random_fill = np.random.choice(obs_vals, size=missing_mask.sum(), replace=True)
x_random_imp = x_observed.copy()
x_random_imp[np.isnan(x_observed)] = random_fill
print('Effect of imputation on descriptive statistics:')
print(f'{'Method':<16} {'Mean':>8} {'Std':>8} {'Skew':>8} {'Kurt':>8}')
print('-' * 50)
for name, arr in [('True data', x_true), ('Observed (no NA)', obs_vals),
('Mean impute', x_mean_imp), ('Median impute', x_median_imp),
('Random impute', x_random_imp)]:
print(f'{name:<16} {np.mean(arr):>8.3f} {np.std(arr,ddof=1):>8.3f} '
f'{stats.skew(arr):>8.4f} {stats.kurtosis(arr):>8.4f}')
print('\nNOTE: Mean imputation artificially reduces variance and kurtosis.')
print('NOTE: Random imputation best preserves the distributional shape.')
Summary Statistics Cross-Reference
Quick reference: which NumPy/SciPy function computes each statistic.
# All with ddof=1 for unbiased sample statistics:
mean = np.mean(x)
median = np.median(x)
var = np.var(x, ddof=1) # Bessel's correction
std = np.std(x, ddof=1)
skew = scipy.stats.skew(x) # Fisher's g1
kurt = scipy.stats.kurtosis(x) # Excess (g2 = raw - 3)
q1,q3 = np.percentile(x, [25,75])
iqr = q3 - q1
mad = np.median(np.abs(x - np.median(x)))
r = np.corrcoef(x, y)[0,1] # Pearson
rho = scipy.stats.spearmanr(x,y).statistic
Sigma = np.cov(X.T, ddof=1) # X shape (n,d)
Next: §02 Estimation Theory
Code cell 52
# === Final Verification: All Core Statistics ===
np.random.seed(42)
x = np.random.normal(10, 3, 500)
results = {
'mean': np.mean(x),
'median': np.median(x),
'var': np.var(x, ddof=1),
'std': np.std(x, ddof=1),
'skew': stats.skew(x),
'kurt': stats.kurtosis(x),
'IQR': np.percentile(x,75) - np.percentile(x,25),
'MAD': np.median(np.abs(x - np.median(x))),
}
print('Core descriptive statistics for N(10, 9), n=500:')
print(f'{'Statistic':<12} {'Value':>10} {'Theory':>10}')
print('-' * 35)
theory = {'mean':10,'median':10,'var':9,'std':3,'skew':0,'kurt':0,
'IQR':3*1.349,'MAD':3*0.6745}
for k, v in results.items():
print(f'{k:<12} {v:>10.4f} {theory[k]:>10.4f}')
# Sanity checks
assert abs(np.mean(x) - 10) < 0.3
assert abs(np.std(x, ddof=1) - 3) < 0.3
assert abs(stats.skew(x)) < 0.3
assert abs(stats.kurtosis(x)) < 0.5
print('\nPASS - All sample statistics converge to theoretical values for large n')
print('\nAll theory notebook cells completed successfully.')