Theory Notebook — Math for LLMs

Descriptive Statistics

Statistics / Descriptive Statistics

Run notebook
Theory Notebook

Theory Notebook

Converted from theory.ipynb for web reading.

Descriptive Statistics — Theory Notebook

"The first step in wisdom is knowing what you don't know — and in statistics, that begins by knowing what your data actually looks like." — John Tukey

Interactive exploration of every descriptive statistic with ML case studies.

Sections: §1 Intuition/EDA · §2 Central Tendency & Spread · §3 Robust Statistics · §4 Bivariate · §5 Multivariate · §6 Standardisation · §7 Visualisation · §8 ML Applications

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Prefer seaborn's colorblind-friendly theme when available; otherwise fall
# back to matplotlib's bundled "seaborn-v0_8-whitegrid" style.
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

# Notebook-wide plot defaults: larger fonts, no top/right spines, tight
# bounding box and higher DPI for saved figures.
mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)  # reproducible draws for every cell that doesn't reseed
print("Plot setup complete.")

Code cell 3

import numpy as np
import scipy.linalg as la
from scipy import stats

# Matplotlib is optional: every plotting cell checks HAS_MPL before drawing.
try:
    import matplotlib.pyplot as plt
    import matplotlib
    plt.style.use('seaborn-v0_8-whitegrid')
    plt.rcParams['figure.figsize'] = [12, 5]
    plt.rcParams['font.size'] = 12
    HAS_MPL = True
except ImportError:
    HAS_MPL = False

# Seaborn is optional too; HAS_SNS gates the seaborn-specific plots.
try:
    import seaborn as sns
    HAS_SNS = True
except ImportError:
    HAS_SNS = False

np.set_printoptions(precision=6, suppress=True)  # compact array output
np.random.seed(42)  # reproducibility for all random draws below
print('Setup complete.')

1. Intuition — Anscombe's Quartet

Four datasets with identical summary statistics but completely different structures. This is the canonical demonstration of why EDA (plotting) must precede modelling.

Code cell 5

# === 1.1 Anscombe's Quartet ===

# The four datasets (exact values from Anscombe 1973).  All four share the
# same x/y means, variances, correlation and least-squares line, yet look
# completely different when plotted.
x1 = np.array([10,8,13,9,11,14,6,4,12,7,5], dtype=float)
y1 = np.array([8.04,6.95,7.58,8.81,8.33,9.96,7.24,4.26,10.84,4.82,5.68])
x2 = np.array([10,8,13,9,11,14,6,4,12,7,5], dtype=float)
y2 = np.array([9.14,8.14,8.74,8.77,9.26,8.10,6.13,3.10,9.13,7.26,4.74])
x3 = np.array([10,8,13,9,11,14,6,4,12,7,5], dtype=float)
y3 = np.array([7.46,6.77,12.74,7.11,7.81,8.84,6.08,5.39,8.15,6.42,5.73])
x4 = np.array([8,8,8,8,8,8,8,19,8,8,8], dtype=float)
y4 = np.array([6.58,5.76,7.71,8.84,8.47,7.04,5.25,12.50,5.56,7.91,6.89])

print('Anscombe\'s Quartet — Summary Statistics:')
# Double-quoted f-string: nesting the same quote type inside an f-string
# is a SyntaxError before Python 3.12 (PEP 701).
print(f"{'Dataset':<12} {'x mean':<10} {'y mean':<10} {'x var':<10} {'y var':<10} {'r':<8}")
print('-' * 60)
for name, x, y in [('I', x1,y1), ('II', x2,y2), ('III', x3,y3), ('IV', x4,y4)]:
    print(f'{name:<12} {np.mean(x):<10.3f} {np.mean(y):<10.3f} '
          f'{np.var(x,ddof=1):<10.3f} {np.var(y,ddof=1):<10.3f} '
          f'{np.corrcoef(x,y)[0,1]:<8.3f}')

# Verify the four datasets really do share (near-)identical summaries
datasets = [(x1,y1),(x2,y2),(x3,y3),(x4,y4)]
xmeans = [np.mean(x) for x,y in datasets]
ymeans = [np.mean(y) for x,y in datasets]
assert max(xmeans)-min(xmeans) < 0.01
assert max(ymeans)-min(ymeans) < 0.01
print('\nPASS - All four datasets have ~identical summary statistics')
print('LESSON: Always plot your data before reporting statistics.')

Code cell 6

# === 1.2 Anscombe Visualisation ===

# Uses x1..y4 from the previous cell and HAS_MPL from the setup cell.
if HAS_MPL:
    fig, axes = plt.subplots(1, 4, figsize=(16, 4))
    datasets = [(x1,y1,'I'),(x2,y2,'II'),(x3,y3,'III'),(x4,y4,'IV')]
    colors = ['#0072B2','#D55E00','#009E73','#CC79A7']  # colorblind-safe palette
    for ax, (x,y,label), col in zip(axes, datasets, colors):
        ax.scatter(x, y, color=col, s=60, zorder=3)
        # Degree-1 least-squares fit — nearly identical across all four panels
        m, b = np.polyfit(x, y, 1)
        xp = np.linspace(3, 20, 100)
        ax.plot(xp, m*xp+b, 'k--', alpha=0.6, lw=1.5)
        ax.set_title(f'Dataset {label}\nr={np.corrcoef(x,y)[0,1]:.3f}')
        ax.set_xlabel('x'); ax.set_ylabel('y')
        # Shared axis limits so the panels are directly comparable
        ax.set_xlim(2, 21); ax.set_ylim(2, 14)
    plt.suptitle("Anscombe's Quartet: Same Statistics, Different Data",
                 fontsize=14, fontweight='bold', y=1.02)
    plt.tight_layout()
    plt.show()
    print('Note: The same regression line fits all four — but only Dataset I is appropriate.')
else:
    print('Matplotlib not available.')

2.1 Measures of Central Tendency

Mean, median, and mode — three different answers to 'what is the centre of the data?' Each minimises a different loss function.

Statistic | Loss minimised | Sensitivity
Mean | $\sum_i (x_i - c)^2$ (L2) | High — pulled by outliers
Median | $\sum_i |x_i - c|$ (L1) | Robust — resistant to outliers
Mode | 0–1 loss | N/A — for categorical data

Code cell 8

# === 2.1 Central Tendency ===

np.random.seed(42)
# Clean Gaussian + one outlier
x_clean = np.random.normal(0, 1, 99)
x_outlier = np.append(x_clean, 50.0)  # one extreme outlier

print('Central tendency measures:')
# Double-quoted f-string: nested same-type quotes are a SyntaxError before Python 3.12
print(f"          {'Clean':>10} {'With outlier':>14}")
print(f'  Mean:   {np.mean(x_clean):>10.4f} {np.mean(x_outlier):>14.4f}')
print(f'  Median: {np.median(x_clean):>10.4f} {np.median(x_outlier):>14.4f}')

# Verify the mean minimises sum-of-squares via a grid search over c
def sum_sq(x, c): return np.sum((x - c)**2)
cs = np.linspace(-2, 2, 1000)
min_c = cs[np.argmin([sum_sq(x_clean, c) for c in cs])]
print(f'\nMinimiser of Σ(x-c)²: {min_c:.4f} (sample mean: {np.mean(x_clean):.4f})')
assert abs(min_c - np.mean(x_clean)) < 0.01
print('PASS - mean minimises squared deviations')

# Verify the median minimises sum-of-absolute-deviations on the same grid
def sum_abs(x, c): return np.sum(np.abs(x - c))
min_c_l1 = cs[np.argmin([sum_abs(x_clean, c) for c in cs])]
print(f'Minimiser of Σ|x-c|:  {min_c_l1:.4f} (sample median: {np.median(x_clean):.4f})')
assert abs(min_c_l1 - np.median(x_clean)) < 0.01
print('PASS - median minimises absolute deviations')

2.2 Measures of Spread

Bessel's correction: using n1n-1 makes the sample variance unbiased.

s2=1n1i=1n(xixˉ)2s^2 = \frac{1}{n-1}\sum_{i=1}^n (x_i - \bar{x})^2

NumPy default (ddof=0) is the MLE (biased). Use ddof=1 for unbiased.

Code cell 10

# === 2.2 Measures of Spread and Bessel's Correction ===

np.random.seed(42)
sigma_true = 3.0
n_vals = [5, 10, 20, 50, 100, 500]

print('Bessel correction — bias comparison (10000 simulations each):')
# Double-quoted f-string: nested same-type quotes are a SyntaxError before Python 3.12
print(f"  {'n':>5}  {'E[s²_biased]':>14}  {'E[s²_unbiased]':>16}  {'True σ²':>8}")
for n in n_vals:
    sims = 10000
    # Vectorised: one (sims, n) draw replaces the per-simulation Python loop.
    # Row-major reshape consumes the RNG stream in the same order, so each
    # row equals one sequentially-drawn sample — results are unchanged.
    X = np.random.normal(0, sigma_true, (sims, n))
    biased = np.var(X, axis=1, ddof=0)    # MLE — biased low by factor (n-1)/n
    unbiased = np.var(X, axis=1, ddof=1)  # Bessel-corrected, unbiased
    print(f'  {n:>5}  {np.mean(biased):>14.4f}  {np.mean(unbiased):>16.4f}  {sigma_true**2:>8.4f}')

print('\nPASS - unbiased estimator (ddof=1) tracks true σ² at all n')
print('LESSON: Always use np.var(x, ddof=1) or np.std(x, ddof=1) for sample statistics.')

2.3 Shape Statistics: Skewness and Kurtosis

  • Skewness g1>0g_1 > 0: long right tail (mean > median) — income, response times
  • Excess kurtosis g2>0g_2 > 0: heavier tails than Gaussian (leptokurtic) — gradient norms, financial returns

Code cell 12

# === 2.3 Skewness and Kurtosis ===

np.random.seed(42)
n = 5000

# Four distributions with different shapes
data = {
    'Gaussian N(0,1)':    np.random.normal(0, 1, n),
    'Exponential(1)':     np.random.exponential(1, n),
    'Uniform(-1,1)':      np.random.uniform(-1, 1, n),
    'Student-t(3)':       np.random.standard_t(3, n),
}

# Double-quoted f-string: nested same-type quotes are a SyntaxError before Python 3.12
print(f"{'Distribution':<20} {'Skew g₁':>10} {'Kurt g₂':>10} {'Mean':>8} {'Median':>8}")
print('-' * 60)
for name, d in data.items():
    print(f'{name:<20} {stats.skew(d):>10.3f} {stats.kurtosis(d):>10.3f} '
          f'{np.mean(d):>8.3f} {np.median(d):>8.3f}')

# Theoretical values (g₂ is EXCESS kurtosis; t(3) has no finite 4th moment)
theory = {'Gaussian N(0,1)': (0, 0), 'Exponential(1)': (2, 6),
          'Uniform(-1,1)': (0, -1.2), 'Student-t(3)': (0, float('inf'))}
print('\nTheoretical skewness/kurtosis:')
for name, (g1, g2) in theory.items():
    print(f'  {name}: g₁={g1}, g₂={g2}')

# Verify the Gaussian sample moments sit near their theoretical zeros
g = data['Gaussian N(0,1)']
assert abs(stats.skew(g)) < 0.1, f'Gaussian skew too large: {stats.skew(g):.3f}'
assert abs(stats.kurtosis(g)) < 0.3, f'Gaussian kurt too large: {stats.kurtosis(g):.3f}'
print('\nPASS - Gaussian has near-zero skewness and kurtosis')

2.4 Order Statistics, Quantiles, and the Empirical CDF

The empirical CDF F^n(x)=1ni1[xix]\hat{F}_n(x) = \frac{1}{n}\sum_i \mathbf{1}[x_i \leq x] is a non-parametric estimator of F(x)=P(Xx)F(x) = P(X \leq x).

Glivenko-Cantelli theorem: supxF^n(x)F(x)a.s.0\sup_x |\hat{F}_n(x) - F(x)| \overset{a.s.}{\to} 0

Code cell 14

# === 2.4 Empirical CDF and Glivenko-Cantelli ===

np.random.seed(42)
from scipy.special import ndtr  # Gaussian CDF

# ECDF: step function that jumps 1/n at each sorted data point
def ecdf(data):
    x_sorted = np.sort(data)
    y = np.arange(1, len(data)+1) / len(data)
    return x_sorted, y

# Glivenko-Cantelli convergence
x_eval = np.linspace(-4, 4, 1000)
F_true = ndtr(x_eval)  # true CDF of N(0,1)

print('Glivenko-Cantelli convergence: sup|F̂_n - F|')
for n in [10, 50, 100, 500, 2000, 10000]:
    sample = np.random.normal(0, 1, n)
    # Broadcasting: F̂(x) = fraction of sample points <= each grid point
    F_hat = np.mean(sample[:, None] <= x_eval, axis=0)
    sup_err = np.max(np.abs(F_hat - F_true))
    print(f'  n={n:6d}: sup error = {sup_err:.5f}')

print('\nPASS - supremum error decreases as n increases (Glivenko-Cantelli)')

# Five-number summary
np.random.seed(42)
x = np.random.exponential(2, 200)
q1, q2, q3 = np.percentile(x, [25, 50, 75])
five_num = np.array([x.min(), q1, q2, q3, x.max()])
print(f'\nFive-number summary (Exp(2) sample, n=200):')
print(f'  min={five_num[0]:.3f}, Q1={five_num[1]:.3f}, median={five_num[2]:.3f}, '
      f'Q3={five_num[3]:.3f}, max={five_num[4]:.3f}')
# For Exp(scale=θ): Q1 = θ·ln(4/3), Q3 = θ·ln(4), hence IQR = θ·ln(3).
# Here θ=2 so the theoretical IQR is 2·ln(3) ≈ 2.197.
# (BUGFIX: the original printed 2·ln(4)·2 ≈ 5.545, which is not the IQR.)
print(f'  IQR = {q3-q1:.3f}, theory = 2·ln(3) ≈ {2*np.log(3):.3f}')

Code cell 15

# === 2.5 Distribution Visualisation ===

# Histograms of the four shape archetypes, annotated with mean vs median.
if HAS_MPL:
    np.random.seed(42)
    fig, axes = plt.subplots(1, 4, figsize=(16, 4))
    # (sample, title with theoretical g₁/g₂, colorblind-safe colour)
    dists = [
        (np.random.normal(0,1,500), 'Gaussian\ng₁=0, g₂=0', '#0072B2'),
        (np.random.exponential(1,500), 'Exponential\ng₁=2, g₂=6', '#D55E00'),
        (np.random.uniform(-1,1,500), 'Uniform\ng₁=0, g₂=-1.2', '#009E73'),
        (np.random.standard_t(3, 500), 'Student-t(3)\ng₁=0, g₂=∞', '#CC79A7'),
    ]
    for ax, (d, title, col) in zip(axes, dists):
        ax.hist(d, bins=30, density=True, alpha=0.6, color=col, edgecolor='white')
        # Mean (dashed) vs median (dotted): they separate under skew
        ax.axvline(np.mean(d), color='black', linestyle='--', label=f'mean={np.mean(d):.2f}', lw=2)
        ax.axvline(np.median(d), color='red', linestyle=':', label=f'median={np.median(d):.2f}', lw=2)
        ax.set_title(title); ax.legend(fontsize=9)
        ax.set_xlabel('x'); ax.set_ylabel('density')
    plt.suptitle('Distribution Shapes: Effect on Mean vs Median', fontsize=13, fontweight='bold')
    plt.tight_layout(); plt.show()
    print('Note: For Exponential, mean >> median due to right skew.')
else:
    print('Matplotlib not available.')

3. Robust Statistics

Breakdown point: fraction of data that can be corrupted before a statistic fails.

Statistic | Breakdown point
Sample mean | $1/n \approx 0\%$
Sample median | $\lfloor (n-1)/2 \rfloor / n \to 50\%$
IQR | $25\%$
MAD | $50\%$

Code cell 17

# === 3.1 Breakdown Point Demonstration ===

np.random.seed(42)
n = 100
x_base = np.random.normal(0, 1, n)

print('Effect of outlier contamination on location estimators:')
# Double-quoted f-string: nested same-type quotes are a SyntaxError before Python 3.12
print(f"{'Frac corrupt':>14} {'Mean':>10} {'Median':>10} {'10%-Trimmed':>12} {'MAD (robust σ)':>16}")
print('-' * 65)

for frac in [0.0, 0.01, 0.05, 0.10, 0.25, 0.49]:
    k = int(frac * n)
    x = x_base.copy()
    x[:k] = 1000.0  # corrupt k observations with extreme value
    trimmed = stats.trim_mean(x, 0.10)
    # 1.4826 rescales the MAD to a consistent σ estimate for Gaussian data
    mad = np.median(np.abs(x - np.median(x))) * 1.4826
    print(f'{frac:>14.2f} {np.mean(x):>10.3f} {np.median(x):>10.3f} '
          f'{trimmed:>12.3f} {mad:>16.3f}')

# BUGFIX: original printed the typo "LESSION"
print('\nLESSON: Mean breaks with even 1% contamination.')
print('         Median and MAD stay near 0 up to 49% contamination.')

Code cell 18

# === 3.2 Outlier Detection Methods Comparison ===

np.random.seed(42)
n_clean = 200
x_clean = np.random.normal(50, 10, n_clean)
outliers = np.array([5, 10, 90, 100, 110])  # 5 outliers
x = np.concatenate([x_clean, outliers])

# Method 1: Z-score (k=3) — uses mean/std, which the outliers themselves inflate
z = (x - np.mean(x)) / np.std(x, ddof=1)
flag_z = np.abs(z) > 3

# Method 2: Tukey fences — quartiles are robust to the contamination
q1, q3 = np.percentile(x, [25, 75])
iqr = q3 - q1
flag_tukey = (x < q1 - 1.5*iqr) | (x > q3 + 1.5*iqr)

# Method 3: Modified Z-score; 0.6745 makes the MAD comparable to a z-score
med = np.median(x)
mad = np.median(np.abs(x - med))
m_z = 0.6745 * (x - med) / mad
flag_mad = np.abs(m_z) > 3.5

# True outliers are the last 5
true_outliers = np.zeros(len(x), dtype=bool)
true_outliers[-5:] = True

def metrics(flag, true):
    """Return (true positives, false positives, false negatives)."""
    tp = np.sum(flag & true)
    fp = np.sum(flag & ~true)
    fn = np.sum(~flag & true)
    return tp, fp, fn

print('Outlier detection performance:')
# Double-quoted f-string: nested same-type quotes are a SyntaxError before Python 3.12
print(f"{'Method':<20} {'True Pos':>10} {'False Pos':>10} {'False Neg':>10}")
for name, flag in [('Z-score (|z|>3)', flag_z), ('Tukey (1.5 IQR)', flag_tukey), ('MAD modified', flag_mad)]:
    tp, fp, fn = metrics(flag, true_outliers)
    print(f'{name:<20} {tp:>10} {fp:>10} {fn:>10}')

print('\nPASS - All three methods detect the 5 injected outliers')
print('NOTE: Z-score has more false positives due to outlier-inflated mean/std.')

4. Bivariate Statistics — Pearson, Spearman, Kendall

Three correlation measures for three types of association:

  • Pearson rr: linear association — sensitive to scale, outliers
  • Spearman ρS\rho_S: monotone association — ranks, robust to outliers
  • Kendall τ\tau: concordance probability — small samples, ordinal data

Code cell 20

# === 4.1 Pearson vs Spearman vs Kendall ===

np.random.seed(42)
n = 200

# Case 1: Linear relationship
x = np.linspace(0, 10, n)
y_linear = 2*x + np.random.normal(0, 2, n)

# Case 2: Monotone nonlinear (cubic)
y_cubic = x**3 + np.random.normal(0, 20, n)

# Case 3: Non-monotone (sine)
y_sine = np.sin(x) + np.random.normal(0, 0.3, n)

# Case 4: Linear with heavy-tail outliers.
# BUGFIX: replace=False guarantees exactly 10 DISTINCT points are corrupted;
# the original sampled with replacement, so duplicates could leave fewer.
y_outlier = y_linear.copy()
y_outlier[np.random.choice(n, 10, replace=False)] = np.random.normal(200, 10, 10)

print('Correlation measures for different relationship types:')
# Double-quoted f-string: nested same-type quotes are a SyntaxError before Python 3.12
print(f"{'Case':<25} {'Pearson r':>12} {'Spearman ρ':>12} {'Kendall τ':>12}")
print('-' * 65)
cases = [
    ('Linear', x, y_linear),
    ('Cubic (monotone)', x, y_cubic),
    ('Sine (non-monotone)', x, y_sine),
    ('Linear + outliers', x, y_outlier),
]
for name, xi, yi in cases:
    r   = np.corrcoef(xi, yi)[0, 1]
    rho = stats.spearmanr(xi, yi).statistic   # .statistic requires SciPy >= 1.9
    tau = stats.kendalltau(xi, yi).statistic
    print(f'{name:<25} {r:>12.4f} {rho:>12.4f} {tau:>12.4f}')

print('\nPASS - All correlation measures computed')
print('NOTE: Spearman detects the cubic relationship (r=1.0) even though Pearson < 1.')
print('NOTE: Outliers reduce Pearson dramatically but Spearman/Kendall are robust.')

5. Multivariate Statistics — Covariance Matrix and Mahalanobis Distance

Σ^=1n1XcXcRd×d\hat{\Sigma} = \frac{1}{n-1}\mathbf{X}_c^\top \mathbf{X}_c \in \mathbb{R}^{d \times d}

Properties: symmetric, positive semi-definite, rank min(d,n1)\leq \min(d, n-1).

Code cell 22

# === 5.1 Empirical Covariance Matrix ===

np.random.seed(42)

# Ground-truth parameters of a 3-D Gaussian
mu_true = np.array([2.0, -1.0, 0.5])
Sigma_true = np.array([[4.0, 2.0, -1.0],
                        [2.0, 9.0,  1.5],
                        [-1.0, 1.5, 2.0]])

# Draw n samples via the Cholesky factor: x = Lz + μ with z ~ N(0, I)
n = 5000
chol = np.linalg.cholesky(Sigma_true)
X = np.random.normal(0, 1, (n, 3)) @ chol.T + mu_true

# Sample mean, then the Bessel-corrected sample covariance of the
# mean-centred data: Σ̂ = XcᵀXc / (n-1)
mu_hat = X.mean(axis=0)
centered = X - mu_hat
Sigma_hat = (centered.T @ centered) / (n - 1)

print('True mean:    ', mu_true)
print('Sample mean:  ', mu_hat)
print()
print('True covariance matrix:')
print(Sigma_true)
print('\nSample covariance matrix:')
print(Sigma_hat)

# A covariance matrix must be symmetric PSD — inspect the spectrum
eigvals = np.linalg.eigvalsh(Sigma_hat)
print(f'\nEigenvalues of Σ̂: {eigvals}')
assert np.all(eigvals > -1e-10), 'Sigma_hat is not PSD!'
print('PASS - Sample covariance matrix is positive semi-definite')

# With n = 5000 the estimates should sit close to the truth
assert np.allclose(mu_hat, mu_true, atol=0.2)
assert np.allclose(Sigma_hat, Sigma_true, atol=0.5)
print('PASS - Sample statistics converge to true parameters')

Code cell 23

# === 5.2 Mahalanobis Distance and Outlier Detection ===

np.random.seed(42)
d = 2
mu = np.array([0.0, 0.0])
Sigma = np.array([[4.0, 3.0], [3.0, 9.0]])

# 200 correlated Gaussian points plus 5 hand-placed outliers
L = np.linalg.cholesky(Sigma)
X_clean = np.random.normal(0, 1, (200, 2)) @ L.T + mu
X_outliers = np.array([[5, 8], [-4, 9], [6, -2], [-5, -8], [7, 3]])
X_all = np.vstack([X_clean, X_outliers])

# Fit mean and covariance on ALL points (outliers included)
mu_hat = X_all.mean(axis=0)
Xc = X_all - mu_hat
Sigma_hat = (Xc.T @ Xc) / (len(X_all) - 1)
Sigma_inv = np.linalg.inv(Sigma_hat)

def mahal(X, mu, Sigma_inv):
    """Row-wise Mahalanobis distance sqrt((x-μ)ᵀ Σ⁻¹ (x-μ))."""
    dev = X - mu
    return np.sqrt(np.sum((dev @ Sigma_inv) * dev, axis=1))

d_M = mahal(X_all, mu_hat, Sigma_inv)

# Under bivariate Gaussian: d_M^2 ~ chi^2(2), so flag past the 97.5th pct
chi2_threshold = stats.chi2.ppf(0.975, df=2)
flagged = d_M**2 > chi2_threshold

print(f'Chi^2(2) 97.5th percentile threshold: {chi2_threshold:.3f}')
print(f'Mahalanobis threshold (sqrt): {np.sqrt(chi2_threshold):.3f}')
print(f'\nFlagged as outliers: {flagged.sum()} points')
print(f'  Clean points flagged (false positives): {flagged[:200].sum()} / 200')
print(f'  Injected outliers flagged: {flagged[200:].sum()} / 5')

assert flagged[200:].sum() >= 3, 'Should detect at least 3 of 5 injected outliers'
print('PASS - Mahalanobis distance detects multivariate outliers')

Code cell 24

# === 5.3 Mahalanobis vs Euclidean — Visualisation ===

# Uses X_all, flagged, Sigma_hat, chi2_threshold, d_M, mu_hat from cell 5.2.
if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Plot 1: scatter with Mahalanobis ellipse
    ax = axes[0]
    clean_mask = ~flagged
    ax.scatter(X_all[clean_mask, 0], X_all[clean_mask, 1],
               c='#0072B2', alpha=0.5, s=20, label='Normal')
    ax.scatter(X_all[flagged, 0], X_all[flagged, 1],
               c='#D55E00', s=80, marker='x', lw=2, label='Flagged (Mahal)')

    # Draw 95% Mahalanobis ellipse: map the unit circle through the
    # eigendecomposition of Σ̂, scaled by sqrt of the χ² threshold
    theta = np.linspace(0, 2*np.pi, 200)
    circle = np.column_stack([np.cos(theta), np.sin(theta)])
    eigvals_s, eigvecs_s = np.linalg.eigh(Sigma_hat)
    radius = np.sqrt(chi2_threshold)
    ellipse = circle @ np.diag(np.sqrt(eigvals_s)) @ eigvecs_s.T * radius + mu_hat
    ax.plot(ellipse[:, 0], ellipse[:, 1], 'k--', lw=2, label='97.5% Mahal ellipse')
    ax.set_title('Mahalanobis Distance Outlier Detection')
    ax.set_xlabel('Feature 1'); ax.set_ylabel('Feature 2')
    ax.legend()

    # Plot 2: d_M^2 vs chi^2(2) CDF — clean points only (first 200)
    ax = axes[1]
    d_M_sq_sorted = np.sort(d_M[:200]**2)
    n_clean = 200
    ecdf_vals = np.arange(1, n_clean+1) / n_clean
    chi2_vals = np.linspace(0, 15, 300)
    ax.step(d_M_sq_sorted, ecdf_vals, label='Empirical CDF of d_M²', color='#0072B2')
    ax.plot(chi2_vals, stats.chi2.cdf(chi2_vals, df=2), 'r--',
            label='χ²(2) CDF', lw=2)
    ax.axvline(chi2_threshold, color='black', linestyle=':', label='97.5% threshold')
    ax.set_xlabel('d_M²'); ax.set_ylabel('CDF')
    ax.set_title('d_M² vs χ²(2) Distribution')
    ax.legend()

    plt.tight_layout(); plt.show()
    print('Under bivariate Gaussianity, d_M² ~ χ²(2) — the ECDF tracks the theoretical CDF.')
else:
    print('Matplotlib not available.')

6. Data Standardisation and Preprocessing

Three scaling methods for different data characteristics:

Method | Formula | Best for
Z-score | $(x - \bar{x})/s$ | Symmetric data, no outliers
Min-Max | $(x - x_{\min})/(x_{\max} - x_{\min})$ | Bounded range needed
Robust | $(x - \tilde{x})/\mathrm{IQR}$ | Outliers present

Critical rule: Always fit the scaler on training data only — never on test data.

Code cell 26

# === 6.1 Scaling Methods Comparison ===

np.random.seed(42)

# Dataset with outliers
x = np.concatenate([np.random.normal(50, 10, 95), [200, 210, -50, -60, 300]])

def z_score(x): return (x - np.mean(x)) / np.std(x, ddof=1)
def min_max(x): return (x - x.min()) / (x.max() - x.min())
def robust_scale(x):
    # Centre on the median, scale by the IQR — both resistant to outliers
    q1, q3 = np.percentile(x, [25, 75])
    return (x - np.median(x)) / (q3 - q1)

print('Scaling comparison (data with 5 outliers out of 100):')
# Double-quoted f-string: nested same-type quotes are a SyntaxError before Python 3.12
print(f"{'Method':<15} {'Mean':>8} {'Std':>8} {'Min':>8} {'Max':>8} {'|Outlier|':>10}")
print('-' * 55)
for name, scaled in [('Z-score', z_score(x)), ('Min-Max', min_max(x)), ('Robust', robust_scale(x))]:
    print(f'{name:<15} {np.mean(scaled):>8.3f} {np.std(scaled,ddof=1):>8.3f} '
          f'{scaled.min():>8.3f} {scaled.max():>8.3f} '
          f'{np.max(np.abs(scaled[-5:])):>10.3f}')

print('\nNOTE: Z-score outlier value is ~25 std devs away — still extreme after scaling.')
print('NOTE: Robust scaling keeps outliers at ~6-7 IQRs, better controlled.')

# Train/test split leakage demo
np.random.seed(42)
train = np.random.normal(50, 10, 100)
test  = np.random.normal(55, 10, 50)   # slightly shifted test distribution

# CORRECT: scale test using training statistics
mu_tr, s_tr = np.mean(train), np.std(train, ddof=1)
test_correct = (test - mu_tr) / s_tr

# WRONG: scale test using test statistics (data leakage)
test_leaked = z_score(test)

print(f'\nTest set mean after correct scaling: {np.mean(test_correct):.3f} (should be ≈+0.5, reflecting shift)')
print(f'Test set mean after leaky scaling:   {np.mean(test_leaked):.3f} (≈0 — shift is hidden!)')
print('PASS - Correct scaling preserves distributional shift; leaky scaling hides it.')

6. Normalisation in Neural Networks

Batch Norm, Layer Norm, and RMS Norm are all sample statistics operations.

The key difference is which dimension the statistics are computed over:

X shape: (batch, sequence, features)
Batch Norm:  stats over (batch) per feature       → breaks for batch_size=1
Layer Norm:  stats over (features) per token      → works for any batch size
RMS Norm:    RMS over (features) per token        → no mean subtraction

Code cell 28

# === 6.2 Batch Norm vs Layer Norm vs RMS Norm ===

np.random.seed(42)
batch_size, d = 32, 512
X = np.random.normal(2.0, 3.0, (batch_size, d))  # non-zero mean, non-unit variance
eps = 1e-5  # numerical-stability floor, as in the real NN layers

# Batch Norm: normalise each feature (column) over the batch
mu_BN = X.mean(axis=0)         # shape (d,)
var_BN = X.var(axis=0)         # biased (ddof=0), shape (d,)
X_BN = (X - mu_BN) / np.sqrt(var_BN + eps)

# Layer Norm: normalise each sample (row) over features
mu_LN = X.mean(axis=1, keepdims=True)   # shape (batch, 1)
var_LN = X.var(axis=1, keepdims=True)   # shape (batch, 1)
X_LN = (X - mu_LN) / np.sqrt(var_LN + eps)

# RMS Norm: divide by root-mean-square (no mean subtraction)
rms = np.sqrt((X**2).mean(axis=1, keepdims=True))
X_RMS = X / (rms + eps)

print('Normalisation layer verification:')
# Double-quoted f-strings below: nested same-type quotes are a
# SyntaxError before Python 3.12
print(f"{'Method':<12} {'Feature mean (col)':>20} {'Feature std (col)':>20} {'Sample RMS (row)':>18}")
print('-' * 74)
print(f"{'BatchNorm':<12} {np.abs(X_BN.mean(axis=0)).mean():>20.6f} "
      f'{X_BN.std(axis=0, ddof=1).mean():>20.6f} '
      f'{np.sqrt((X_BN**2).mean(axis=1)).mean():>18.6f}')
print(f"{'LayerNorm':<12} {np.abs(X_LN.mean(axis=1)).mean():>20.6f} "
      f'{X_LN.std(axis=1, ddof=1).mean():>20.6f} '
      f'{np.sqrt((X_LN**2).mean(axis=1)).mean():>18.6f}')
print(f"{'RMSNorm':<12} {np.abs(X_RMS.mean(axis=1)).mean():>20.6f} "
      f'{X_RMS.std(axis=1, ddof=1).mean():>20.6f} '
      f'{np.sqrt((X_RMS**2).mean(axis=1)).mean():>18.6f}')

# Verify the defining invariant of each scheme
assert np.allclose(X_BN.mean(axis=0), 0, atol=1e-6), 'BatchNorm: column means not 0'
assert np.allclose(X_BN.var(axis=0), 1, atol=1e-3), 'BatchNorm: column vars not 1'
assert np.allclose(X_LN.mean(axis=1), 0, atol=1e-6), 'LayerNorm: row means not 0'
assert np.allclose(X_LN.var(axis=1), 1, atol=1e-3), 'LayerNorm: row vars not 1'
assert np.allclose(np.sqrt((X_RMS**2).mean(axis=1)), 1, atol=1e-3), 'RMSNorm: RMS not 1'
print('\nPASS - All three normalisation methods verified')

# Batch Norm breaks for batch_size=1: the per-feature variance over a
# single sample is identically zero, so every output collapses to ~0
X_single = X[[0]]  # shape (1, d)
var_single = X_single.var(axis=0)  # all zeros!
X_BN_single = (X_single - X_single.mean(axis=0)) / np.sqrt(var_single + eps)
print(f'\nBatchNorm batch_size=1: all outputs ≈ 0: {np.allclose(X_BN_single, 0, atol=0.1)}')
print('LESSON: LayerNorm is used in transformers precisely because it works for batch_size=1.')

7. Visualisation for EDA

The complete EDA visualisation toolkit: histograms, box plots, violin plots, ECDFs, Q-Q plots, scatter plots, pair plots, and correlation heatmaps.

Code cell 30

# === 7.1 Univariate Plots ===

# Four standard univariate EDA views of a deliberately bimodal sample.
if HAS_MPL:
    np.random.seed(42)
    # Bimodal: 150 points from N(0,1) plus 50 from N(4,0.8)
    x = np.concatenate([np.random.normal(0, 1, 150), np.random.normal(4, 0.8, 50)])

    fig, axes = plt.subplots(1, 4, figsize=(18, 4))

    # Histogram with KDE
    ax = axes[0]
    ax.hist(x, bins=30, density=True, color='#0072B2', alpha=0.6, edgecolor='white')
    x_eval = np.linspace(x.min()-1, x.max()+1, 300)
    # Silverman's rule picks the KDE bandwidth from sample size/spread
    kde = stats.gaussian_kde(x, bw_method='silverman')
    ax.plot(x_eval, kde(x_eval), 'r-', lw=2, label='KDE')
    ax.axvline(np.mean(x), color='black', ls='--', label=f'mean={np.mean(x):.2f}')
    ax.axvline(np.median(x), color='orange', ls=':', lw=2, label=f'median={np.median(x):.2f}')
    ax.set_title('Histogram + KDE'); ax.set_xlabel('x'); ax.legend(fontsize=9)

    # Box plot
    ax = axes[1]
    ax.boxplot(x, vert=True, patch_artist=True,
               boxprops=dict(facecolor='#56B4E9', alpha=0.7))
    ax.set_title('Box Plot (Tukey fences)'); ax.set_ylabel('x')

    # ECDF
    ax = axes[2]
    x_sorted = np.sort(x)
    y_ecdf = np.arange(1, len(x)+1) / len(x)
    ax.step(x_sorted, y_ecdf, color='#009E73', lw=2, label='ECDF')
    ax.set_title('Empirical CDF'); ax.set_xlabel('x'); ax.set_ylabel('F(x)')

    # Q-Q plot vs Gaussian: sample quantiles against theoretical N(0,1) ones
    ax = axes[3]
    (osm, osr), _ = stats.probplot(x, dist='norm')
    ax.scatter(osm, osr, color='#D55E00', s=15, alpha=0.7)
    xlim = np.array([min(osm), max(osm)])
    ax.plot(xlim, xlim, 'k--', lw=1.5, label='y=x (Gaussian)')
    ax.set_title('Q-Q Plot vs Gaussian'); ax.set_xlabel('Theoretical quantiles')
    ax.set_ylabel('Sample quantiles'); ax.legend()

    plt.suptitle('Univariate EDA Plots — Bimodal Distribution', fontsize=13, fontweight='bold')
    plt.tight_layout(); plt.show()
    print('Q-Q plot shows S-shaped deviation from Gaussian — confirms bimodality.')
else:
    print('Matplotlib not available.')

Code cell 31

# === 7.2 Correlation Heatmap ===

if HAS_MPL:
    np.random.seed(42)
    n, d = 300, 6
    feature_names = ['income', 'credit_score', 'age', 'debt_ratio', 'late_pmts', 'num_accounts']

    # Construct data with known correlation structure
    z = np.random.normal(0, 1, (n, d))
    # Induce correlations by mixing columns with fresh noise
    z[:, 1] = 0.7*z[:, 0] + 0.7*np.random.normal(0,1,n)   # credit ~ income
    z[:, 3] = -0.5*z[:, 0] + 0.8*np.random.normal(0,1,n)  # debt_ratio ~ -income
    z[:, 4] = 0.6*z[:, 3] + 0.8*np.random.normal(0,1,n)   # late_pmts ~ debt_ratio

    # corrcoef expects variables in rows, hence the transpose
    R = np.corrcoef(z.T)

    fig, ax = plt.subplots(figsize=(7, 6))
    im = ax.imshow(R, vmin=-1, vmax=1, cmap='coolwarm', aspect='auto')
    plt.colorbar(im, ax=ax, label='Pearson r')
    ax.set_xticks(range(d)); ax.set_yticks(range(d))
    ax.set_xticklabels(feature_names, rotation=45, ha='right')
    ax.set_yticklabels(feature_names)
    # Annotate each cell; switch text colour on strong correlations for contrast
    for i in range(d):
        for j in range(d):
            ax.text(j, i, f'{R[i,j]:.2f}', ha='center', va='center',
                    fontsize=9, color='black' if abs(R[i,j]) < 0.6 else 'white')
    ax.set_title('Sample Correlation Matrix — Financial Features', fontweight='bold')
    plt.tight_layout(); plt.show()
    print(f'income ↔ credit_score correlation: {R[0,1]:.3f}')
    print(f'debt_ratio ↔ late_payments correlation: {R[3,4]:.3f}')
else:
    print('Matplotlib not available.')

8. ML Applications

8.1 Adam Optimiser as Online Sample Statistics

Adam's first and second moment estimates are exponentially-weighted moving averages (EWMA) of the gradient and squared gradient — with bias correction analogous to Bessel's correction.

Code cell 33

# === 8.1 Adam Momentum as Running Statistics ===

np.random.seed(42)
T = 500
mu_g, sigma_g = 0.1, 1.0  # true gradient distribution
gradients = np.random.normal(mu_g, sigma_g, T)

beta1, beta2 = 0.9, 0.999  # Adam's default decay rates
m, v = 0.0, 0.0

m_hat_hist = []
v_hat_hist = []
running_mean = []
# (removed the `running_var` accumulator from the original — it was
#  initialised but never appended to or read anywhere)

cum_mean = 0.0
for t, g in enumerate(gradients, 1):
    # Adam updates: EWMA of g and g², then bias correction 1/(1-β^t)
    m = beta1 * m + (1 - beta1) * g
    v = beta2 * v + (1 - beta2) * g**2
    m_hat = m / (1 - beta1**t)
    v_hat = v / (1 - beta2**t)
    m_hat_hist.append(m_hat)
    v_hat_hist.append(v_hat)
    # Exact running mean via Welford's incremental update
    cum_mean = cum_mean + (g - cum_mean) / t  # Welford's
    running_mean.append(cum_mean)

m_hat_hist = np.array(m_hat_hist)
v_hat_hist = np.array(v_hat_hist)
running_mean = np.array(running_mean)

print(f'True gradient mean:    {mu_g:.4f}')
print(f'True E[g²]:            {sigma_g**2 + mu_g**2:.4f}')
print(f'\nAdam m̂ at t=500:       {m_hat_hist[-1]:.4f}')
print(f'True running mean:      {running_mean[-1]:.4f}')
print(f'Adam v̂ at t=500:       {v_hat_hist[-1]:.4f}')
print(f'True E[g²]:            {np.mean(gradients**2):.4f}')

# Verify bias correction matters at early steps: without it, m₁ would be
# shrunk by a factor (1-β₁)
print(f'\nAt t=1:')
print(f'  Uncorrected m₁ = (1-β₁)g₁ = {(1-beta1)*gradients[0]:.4f}')
print(f'  Bias-corrected m̂₁ = g₁     = {m_hat_hist[0]:.4f}')
print(f'  True g₁                     = {gradients[0]:.4f}')

assert np.isclose(m_hat_hist[0], gradients[0], atol=1e-10)
print('PASS - Bias correction at t=1 recovers the first gradient exactly')

Code cell 34

# === 8.2 Adam Convergence Visualisation ===

# Uses T, gradients, m_hat_hist, running_mean and mu_g from cell 8.1.
if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    steps = np.arange(1, T+1)

    # Left: m̂_t vs running mean
    axes[0].plot(steps, m_hat_hist, color='#0072B2', lw=1.5, label='Adam m̂ₜ (bias-corrected)')
    axes[0].plot(steps, running_mean, color='#D55E00', lw=1.5, ls='--', label='True running mean')
    axes[0].axhline(mu_g, color='black', ls=':', lw=1.5, label=f'True μ = {mu_g}')
    axes[0].set_xlabel('Step t'); axes[0].set_ylabel('First moment')
    axes[0].set_title('Adam m̂ₜ vs True Running Mean'); axes[0].legend()

    # Right: compare β₁ values (effective window size ≈ 1/(1-β₁))
    beta1_vals = [0.5, 0.9, 0.99]
    colors = ['#0072B2', '#D55E00', '#009E73']
    for b1, col in zip(beta1_vals, colors):
        # Recompute the bias-corrected EWMA for this β₁
        m_b = 0.0
        mh = []
        for t, g in enumerate(gradients, 1):
            m_b = b1*m_b + (1-b1)*g
            mh.append(m_b / (1 - b1**t))
        win = int(1/(1-b1))
        axes[1].plot(steps, mh, color=col, lw=1.5, label=f'β₁={b1} (window≈{win})')
    axes[1].axhline(mu_g, color='black', ls=':', lw=2, label='True μ')
    axes[1].set_xlabel('Step t'); axes[1].set_ylabel('m̂ₜ')
    axes[1].set_title('Effect of β₁ on Convergence Speed'); axes[1].legend()

    plt.suptitle('Adam: Exponentially-Weighted Sample Statistics', fontsize=13, fontweight='bold')
    plt.tight_layout(); plt.show()
    print('Larger β₁ = larger effective window = slower convergence to true mean.')
else:
    print('Matplotlib not available.')

Code cell 35

# === 8.3 Data Drift Detection via Descriptive Statistics ===

np.random.seed(42)

# Simulate 12 months of production data — drift introduced at month 6
months = 12
n_per_month = 500
feature_data = []

for m in range(months):
    if m < 6:
        x_m = np.random.normal(50, 10, n_per_month)  # stable
    else:
        x_m = np.random.normal(50 + (m-5)*3, 10, n_per_month)  # gradual drift
    feature_data.append(x_m)

# Reference (training) statistics come from month 1 only
mu_train = np.mean(feature_data[0])
s_train  = np.std(feature_data[0], ddof=1)

print('Monthly descriptive statistics monitoring:')
# Double-quoted f-string: nested same-type quotes are a SyntaxError before Python 3.12
print(f"{'Month':>6} {'Mean':>8} {'Std':>8} {'Skew':>8} {'|Mean shift|/SE':>16} {'Drift flag':>12}")
print('-' * 62)

# Standard error of a monthly mean under the training distribution
se = s_train / np.sqrt(n_per_month)
for m_idx, x_m in enumerate(feature_data):
    mu_m = np.mean(x_m)
    s_m = np.std(x_m, ddof=1)
    skew_m = stats.skew(x_m)
    z_shift = abs(mu_m - mu_train) / se  # mean shift measured in SE units
    flag = '*** DRIFT ***' if z_shift > 3 else ''
    print(f'{m_idx+1:>6} {mu_m:>8.2f} {s_m:>8.2f} {skew_m:>8.3f} {z_shift:>16.2f} {flag:>12}')

print('\nPASS - Drift detected in months 7-12 via mean shift > 3 standard errors')
print('LESSON: Monitor descriptive statistics continuously in production ML systems.')

Summary

This notebook covered the complete descriptive statistics toolkit:

  1. Anscombe's Quartet — why plotting always comes before statistics
  2. Central tendency — mean minimises L2 loss; median minimises L1 loss
  3. Bessel's correction — ddof=1 for unbiased sample variance
  4. Skewness and kurtosis — shape of the distribution; relevant for gradient noise
  5. ECDF and Glivenko-Cantelli — empirical CDF converges uniformly to population CDF
  6. Robust statistics — MAD and median have 50% breakdown point vs 0% for mean/variance
  7. Pearson/Spearman/Kendall — linear, monotone, and concordance correlation
  8. Covariance matrix — PSD, Bessel correction, rank ≤ min(d, n-1)
  9. Mahalanobis distance — accounts for scale and correlation; $d_M^2 \sim \chi^2(d)$
  10. BatchNorm / LayerNorm / RMSNorm — normalisation layers as sample statistics
  11. Adam — bias-corrected EWMA of gradient and gradient² as running statistics
  12. Drift detection — monitoring mean shift in production data

Next: §02 Estimation Theory — How good are sample statistics as estimators? Bias, variance, MLE, Fisher information.


3. Robust Estimators — Trimmed Mean and Winsorisation

Trimmed mean: remove the bottom/top $\alpha$-fraction of observations before averaging.

Winsorised mean: clip extreme values to the $\alpha$ and $1-\alpha$ quantiles before averaging.

Both are L-estimators — linear combinations of order statistics.

For AI: Gradient clipping = Winsorisation of gradient norms.

Code cell 38

# === 3.3 Trimmed Mean, Winsorised Mean, and Gradient Clipping ===
#
# Three robustifications of the mean, applied to simulated heavy-tailed
# gradient norms (|t(3)| draws scaled by sigma_g).

np.random.seed(42)
n = 1000
sigma_g = 2.0
# Student-t with 3 dof has heavy tails (infinite 4th moment) — a common model
# for gradient-norm distributions in deep-network training.
gradient_norms = np.abs(np.random.standard_t(3, n)) * sigma_g

alpha = 0.05  # trim/winsorise 5% in each tail
clip_threshold = np.percentile(gradient_norms, 95)  # 95th percentile as clipping threshold

# Trimmed mean: drop the lowest and highest alpha-fraction, average the rest.
trimmed = stats.trim_mean(gradient_norms, alpha)

# Winsorised mean: clamp values into [Q_alpha, Q_{1-alpha}], then average.
q_lo = np.percentile(gradient_norms, alpha*100)
q_hi = np.percentile(gradient_norms, (1-alpha)*100)
winsorised = np.clip(gradient_norms, q_lo, q_hi).mean()

# Gradient clipping: cap only the upper tail (norms are non-negative).
clipped = np.where(gradient_norms > clip_threshold, clip_threshold, gradient_norms)
clipped_mean = clipped.mean()

print('Gradient norm statistics (heavy-tailed, t(3) distribution):')
print(f'  Raw mean:          {np.mean(gradient_norms):.4f}')
print(f'  Raw median:        {np.median(gradient_norms):.4f}')
print(f'  5%-Trimmed mean:   {trimmed:.4f}')
print(f'  5%-Winsorised mean:{winsorised:.4f}')
print(f'  After clipping at P95={clip_threshold:.3f}: {clipped_mean:.4f}')
print(f'  Fraction clipped:  {np.mean(gradient_norms > clip_threshold)*100:.1f}%')

# Show that clipping reduces variance.
# FIX: use ddof=1 throughout — the original computed the reduction ratio with
# population variance (ddof=0) while printing unbiased variances (ddof=1).
# The printed ratio is numerically identical because the 1/(n-1) factors
# cancel, but the code is now consistent with its own output.
var_raw = np.var(gradient_norms, ddof=1)
var_clipped = np.var(clipped, ddof=1)
print(f'\n  Raw variance:     {var_raw:.4f}')
print(f'  Clipped variance: {var_clipped:.4f}')
print(f'  Variance reduction: {(1 - var_clipped/var_raw)*100:.1f}%')
print('PASS - Clipping substantially reduces gradient variance')
print('LESSON: Gradient clipping in LLM training = Winsorisation of gradient norms.')

4. Bivariate Visualisation — Scatter Plots and Correlation

Visualising correlation for different relationship types.

Code cell 40

# === 4.2 Bivariate Visualisation ===

if HAS_MPL:
    np.random.seed(42)
    n = 150
    x = np.random.uniform(0, 10, n)

    # Four panels: (label, response, colour). The noise draws occur in list
    # order, which fixes the RNG sequence for reproducibility.
    cases = [
        ('Strong linear\nr=0.95', 3 * x + np.random.normal(0, 2, n), '#0072B2'),
        ('Weak linear\nr≈0.3', 0.5 * x + np.random.normal(0, 4, n), '#D55E00'),
        ('Quadratic\n(r=0)', (x - 5) ** 2 + np.random.normal(0, 2, n), '#009E73'),
        ('No relationship\nr≈0', np.random.normal(5, 3, n), '#CC79A7'),
    ]

    fig, axes = plt.subplots(1, 4, figsize=(16, 4))
    for ax, (label, y, colour) in zip(axes, cases):
        ax.scatter(x, y, color=colour, alpha=0.5, s=20)
        pearson_r = np.corrcoef(x, y)[0, 1]
        spearman_rho = stats.spearmanr(x, y).statistic
        # Least-squares line as a visual reference for the linear trend.
        slope, intercept = np.polyfit(x, y, 1)
        grid = np.linspace(0, 10, 100)
        ax.plot(grid, slope * grid + intercept, 'k--', lw=1.5)
        ax.set_title(f'{label}\nPearson r={pearson_r:.2f}, Spearman ρ={spearman_rho:.2f}', fontsize=10)
        ax.set_xlabel('x')
        ax.set_ylabel('y')
    plt.suptitle('Bivariate Relationships: Pearson vs Spearman', fontsize=13, fontweight='bold')
    plt.tight_layout()
    plt.show()
    print('Quadratic: Pearson≈0 (no linear association) but Spearman captures the trend.')
else:
    print('Matplotlib not available.')

5. Curse of Dimensionality — Concentration of Pairwise Distances

As dimensionality $d$ grows, all pairwise distances concentrate around the same value. This makes nearest-neighbour methods and attention unreliable in high dimensions.

The $1/\sqrt{d_k}$ scaling in attention is a direct fix: divide by the standard deviation of the dot products to restore unit variance.

Code cell 42

# === 5.4 Concentration of Distances in High Dimensions ===

# FIX: use pdist instead of the (n, n, d) broadcast. The broadcast built a
# ~320 MB float64 temporary at n=200, d=1000; pdist computes the same
# n*(n-1)/2 unique distances directly in C with O(n^2) memory.
from scipy.spatial.distance import pdist

np.random.seed(42)
n_pts = 200
dims = [2, 5, 10, 50, 100, 500, 1000]

print('Concentration of pairwise Euclidean distances:')
print(f"{'dim d':>8} {'Mean dist':>12} {'Std dist':>12} {'CV = std/mean':>15} {'Max-Min':>10}")
print('-' * 60)

for d in dims:
    # Gaussian cloud scaled so each coordinate has variance 1/d; this keeps
    # the typical inter-point distance O(1) across dimensionalities.
    X = np.random.normal(0, 1, (n_pts, d)) / np.sqrt(d)
    d_vec = pdist(X)  # same values as the upper triangle of the full matrix
    print(f'{d:>8} {np.mean(d_vec):>12.4f} {np.std(d_vec):>12.4f} '
          f'{np.std(d_vec)/np.mean(d_vec):>15.4f} {d_vec.max()-d_vec.min():>10.4f}')

print('\nConcentration: std/mean → 0 as d → ∞. All distances become equal!')
print('This is why attention uses 1/√d_k scaling to prevent softmax saturation.')

# Dot-product variance demo: for independent q, k ~ N(0, I), E[q·k] = 0 and
# Var[q·k] = d_k, so the std grows like √d_k — exactly what 1/√d_k undoes.
print('\nDot product variance (unit Gaussian vectors):')
print(f"{'d_k':>6} {'E[q·k]':>10} {'Var[q·k]':>12} {'Std':>8}")
for dk in [64, 128, 256, 512, 1024]:
    Q = np.random.normal(0, 1, (1000, dk))
    K = np.random.normal(0, 1, (1000, dk))
    dots = (Q * K).sum(axis=1)
    print(f'{dk:>6} {np.mean(dots):>10.2f} {np.var(dots):>12.2f} '
          f'{np.std(dots):>8.2f} (theory: {np.sqrt(dk):.2f})')

print('\nPASS - Std of dot products scales as √d_k. Scaling by 1/√d_k restores unit std.')

8. Dataset Characterisation for Model Cards

Computing descriptive statistics for responsible AI documentation. Required by Hugging Face model cards, Google's Model Cards, and the EU AI Act.

Code cell 44

# === 8.3 Dataset Statistics for Model Cards ===

np.random.seed(42)
n = 1000

# Synthetic loan-application table. Draw order is fixed (age, income,
# credit_score, debt_ratio, target) so the seed reproduces the same data.
age         = np.random.normal(42, 12, n).clip(18, 80).astype(int)
income      = np.random.lognormal(4.0, 0.6, n)   # right-skewed (log-normal)
credit_score= np.random.normal(680, 80, n).clip(300, 850).astype(int)
debt_ratio  = np.random.beta(2, 5, n)            # bounded [0, 1], right-skewed
# Default probability rises with high debt ratio and falls with good credit.
default     = (0.2 + 0.3*(debt_ratio > 0.4) - 0.1*(credit_score > 700) > np.random.rand(n)).astype(int)

features = {'age': age, 'income': income, 'credit_score': credit_score,
            'debt_ratio': debt_ratio}

print('=' * 65)
print('DATASET CARD — Loan Default Prediction Dataset')
print(f'n = {n} samples, d = {len(features)} features + 1 target')
print('=' * 65)

print(f"\n{'Feature':<14} {'Mean':>8} {'Std':>8} {'Min':>8} {'Q50':>8} {'Max':>8} {'Skew':>7}")
print('-' * 65)
for name, arr in features.items():
    row = (f'{name:<14} {np.mean(arr):>8.2f} {np.std(arr, ddof=1):>8.2f} '
           f'{np.min(arr):>8.2f} {np.median(arr):>8.2f} '
           f'{np.max(arr):>8.2f} {stats.skew(arr):>7.3f}')
    print(row)

# Label distribution and imbalance ratio.
n_pos = default.sum()
n_neg = n - n_pos
print(f'\nTarget: default=0: {n_neg} ({n_neg/n*100:.1f}%), '
      f'default=1: {n_pos} ({n_pos/n*100:.1f}%)')
print(f'Class imbalance ratio: {n_neg/n_pos:.1f}:1')

# Marginal (univariate) association of each feature with the target.
print('\nPearson correlation with default:')
for name, arr in features.items():
    corr = np.corrcoef(arr, default)[0, 1]
    print(f'  {name:<14}: r = {corr:+.4f}')

print('\nPASS - Dataset card statistics computed')
print('NOTE: Income is right-skewed (log-normal) → log-transform recommended.')
print('NOTE: Class imbalance → use balanced accuracy or F1, not raw accuracy.')

7. Assessing Normality — Q-Q Plots

Q-Q (Quantile-Quantile) plots compare sample quantiles to theoretical (Gaussian) quantiles. Deviations from the diagonal $y = x$ reveal departures from normality:

  • S-shape: lighter tails (platykurtic)
  • Bow-shape: heavier tails (leptokurtic) — common for gradient distributions
  • Systematic offset: skewness

Code cell 46

# === 7.3 Q-Q Plots for Normality Assessment ===

if HAS_MPL:
    np.random.seed(42)
    n = 300

    # (label, sample) pairs — draws happen in list order, fixing the RNG sequence.
    dists = [
        ('Gaussian\n(expect diagonal)', np.random.normal(0, 1, n)),
        ('Leptokurtic\nStudent-t(3)', np.random.standard_t(3, n)),
        ('Right-skewed\nExp(1)', np.random.exponential(1, n)),
        ('Platykurtic\nUniform(-1,1)', np.random.uniform(-1, 1, n)),
    ]

    fig, axes = plt.subplots(1, 4, figsize=(16, 4))
    for ax, (label, sample) in zip(axes, dists):
        # probplot returns ordered theoretical and sample quantiles.
        (theo_q, samp_q), _ = stats.probplot(sample, dist='norm')
        ax.scatter(theo_q, samp_q, s=10, alpha=0.6, color='#0072B2')
        # Symmetric axis limit from the extreme theoretical quantile.
        lim = max(abs(min(theo_q)), abs(max(theo_q))) * 1.1
        ax.plot([-lim, lim], [-lim, lim], 'r--', lw=1.5)
        ax.set_title(label, fontsize=10)
        ax.set_xlabel('Theoretical quantiles')
        ax.set_ylabel('Sample quantiles')
        ax.set_xlim(-lim, lim)
        # Excess kurtosis annotation (g2 = 0 for a Gaussian).
        ax.text(0.05, 0.92, f'g₂={stats.kurtosis(sample):.2f}',
                transform=ax.transAxes, fontsize=9)

    plt.suptitle('Q-Q Plots: Diagnosing Departures from Normality', fontsize=13, fontweight='bold')
    plt.tight_layout()
    plt.show()
    print('Heavy tails (t-distribution): bow-shaped Q-Q plot, g₂ > 0')
    print('Gradient distributions in LLMs often look like the t(3) Q-Q plot.')
else:
    print('Matplotlib not available.')

Appendix: Welford's Online Algorithm

Numerically stable one-pass computation of mean and variance. Used in Batch Norm running statistics and streaming data monitoring.

Code cell 48

# === Welford's Online Algorithm ===

def welford_update(n, mean, M2, x):
    """Fold one observation into Welford's running (count, mean, M2) state.

    n is the number of samples seen so far, mean the running mean, and M2
    the running sum of squared deviations; x is the new observation.
    Returns the updated (n, mean, M2); the unbiased variance is M2 / (n - 1).
    Stable because it never forms the cancellation-prone sum(x²) - sum(x)²/n.
    """
    count = n + 1
    delta = x - mean
    new_mean = mean + delta / count
    # The second deviation uses the *updated* mean — the key to stability.
    new_M2 = M2 + delta * (x - new_mean)
    return count, new_mean, new_M2

# Process data one-at-a-time
np.random.seed(42)
data = np.random.normal(10, 3, 1000)

def run_welford(values):
    """Stream values through welford_update; return the final (n, mean, M2)."""
    state = (0, 0.0, 0.0)
    for v in values:
        state = welford_update(*state, v)
    return state

n_w, mean_w, M2_w = run_welford(data)
var_w = M2_w / (n_w - 1)  # Bessel-corrected (unbiased) variance

print('Welford vs batch computation:')
print(f'  Welford mean:   {mean_w:.8f}')
print(f'  Batch mean:     {np.mean(data):.8f}')
print(f'  Welford var:    {var_w:.8f}')
print(f'  Batch var:      {np.var(data, ddof=1):.8f}')

assert np.isclose(mean_w, np.mean(data), atol=1e-10)
assert np.isclose(var_w, np.var(data, ddof=1), atol=1e-10)
print('\nPASS - Welford and batch computation give identical results')

# Stress test: near-constant data, where the textbook one-pass formula
# sum(x²) - sum(x)²/n loses nearly all significant digits.
x_const = 1e8 + np.random.normal(0, 1, 100)

n_naive = len(x_const)
sum1 = np.sum(x_const)
sum2 = np.sum(x_const**2)
var_naive = (sum2 - sum1**2/n_naive) / (n_naive - 1)  # catastrophic cancellation

n_w, mean_w, M2_w = run_welford(x_const)
var_welford = M2_w / (n_w - 1)

# Ground truth: subtract the huge offset first, then compute normally.
var_true = np.var(x_const - 1e8, ddof=1)

print(f'\nNear-constant data (values ≈ 10^8, noise σ=1):')
print(f'  True variance:    {var_true:.6f}')
print(f'  Naïve formula:    {var_naive:.6f} (may be inaccurate due to cancellation)')
print(f'  Welford:          {var_welford:.6f}')
print('Welford is numerically stable; naïve formula suffers catastrophic cancellation.')

6. Missing Data — Imputation Strategies

Different imputation methods have different effects on the descriptive statistics of the filled dataset.

Code cell 50

# === 6.4 Missing Data Imputation ===

np.random.seed(42)
n = 500
x_true = np.random.normal(50, 15, n)

# MCAR missingness: each entry is independently dropped with p = 0.20.
missing_mask = np.random.rand(n) < 0.20
x_observed = x_true.copy().astype(float)
x_observed[missing_mask] = np.nan

nan_here = np.isnan(x_observed)

# Strategies 1 & 2: fill every hole with a single summary statistic.
x_mean_imp   = np.where(nan_here, np.nanmean(x_observed), x_observed)
x_median_imp = np.where(nan_here, np.nanmedian(x_observed), x_observed)

# Strategy 3: resample fills from the empirical distribution of observed values.
obs_vals = x_observed[~nan_here]
random_fill = np.random.choice(obs_vals, size=missing_mask.sum(), replace=True)
x_random_imp = x_observed.copy()
x_random_imp[nan_here] = random_fill

print('Effect of imputation on descriptive statistics:')
print(f"{'Method':<16} {'Mean':>8} {'Std':>8} {'Skew':>8} {'Kurt':>8}")
print('-' * 50)
variants = [('True data', x_true), ('Observed (no NA)', obs_vals),
            ('Mean impute', x_mean_imp), ('Median impute', x_median_imp),
            ('Random impute', x_random_imp)]
for label, arr in variants:
    print(f'{label:<16} {np.mean(arr):>8.3f} {np.std(arr, ddof=1):>8.3f} '
          f'{stats.skew(arr):>8.4f} {stats.kurtosis(arr):>8.4f}')

print('\nNOTE: Mean imputation artificially reduces variance and kurtosis.')
print('NOTE: Random imputation best preserves the distributional shape.')

Summary Statistics Cross-Reference

Quick reference: which NumPy/SciPy function computes each statistic.

# All with ddof=1 for unbiased sample statistics:
mean   = np.mean(x)
median = np.median(x)
var    = np.var(x, ddof=1)         # Bessel's correction
std    = np.std(x, ddof=1)
skew   = scipy.stats.skew(x)       # Fisher's g1
kurt   = scipy.stats.kurtosis(x)   # Excess (g2 = raw - 3)
q1,q3  = np.percentile(x, [25,75])
iqr    = q3 - q1
mad    = np.median(np.abs(x - np.median(x)))
r      = np.corrcoef(x, y)[0,1]    # Pearson
rho    = scipy.stats.spearmanr(x,y).statistic
Sigma  = np.cov(X.T, ddof=1)       # X shape (n,d)

Next: §02 Estimation Theory

Code cell 52

# === Final Verification: All Core Statistics ===

np.random.seed(42)
x = np.random.normal(10, 3, 500)

# Core descriptive statistics, all using the unbiased (ddof=1) convention.
q25, q75 = np.percentile(x, [25, 75])
results = {
    'mean':   np.mean(x),
    'median': np.median(x),
    'var':    np.var(x, ddof=1),
    'std':    np.std(x, ddof=1),
    'skew':   stats.skew(x),
    'kurt':   stats.kurtosis(x),
    'IQR':    q75 - q25,
    'MAD':    np.median(np.abs(x - np.median(x))),
}

print('Core descriptive statistics for N(10, 9), n=500:')
print(f"{'Statistic':<12} {'Value':>10} {'Theory':>10}")
print('-' * 35)
# Population values for N(10, 3²): IQR = 1.349σ, MAD = 0.6745σ.
theory = {'mean': 10, 'median': 10, 'var': 9, 'std': 3, 'skew': 0, 'kurt': 0,
          'IQR': 3*1.349, 'MAD': 3*0.6745}
for stat_name, value in results.items():
    print(f'{stat_name:<12} {value:>10.4f} {theory[stat_name]:>10.4f}')

# Sanity checks: sample statistics must be close to population values.
assert abs(np.mean(x) - 10) < 0.3
assert abs(np.std(x, ddof=1) - 3) < 0.3
assert abs(stats.skew(x)) < 0.3
assert abs(stats.kurtosis(x)) < 0.5
print('\nPASS - All sample statistics converge to theoretical values for large n')
print('\nAll theory notebook cells completed successfully.')