Theory Notebook
Converted from theory.ipynb for web reading.
Positive Definite Matrices — Theory Notebook
"Positive definiteness is the matrix condition that makes everything work."
Interactive exploration of positive definite matrices, Cholesky decomposition, Schur complements, log-determinants, Gram matrices, and applications in ML.
Sections covered:
- Quadratic Forms and Geometry
- Characterizations of PD
- Cholesky Decomposition (from scratch)
- LDL^T Factorization
- Matrix Square Root and Whitening
- Schur Complement
- Log-Determinant
- Gram Matrices and Kernels
- PSD Cone and SDP
- Machine Learning Applications
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
import seaborn as sns
sns.set_theme(style="whitegrid", palette="colorblind")
HAS_SNS = True
except ImportError:
plt.style.use("seaborn-v0_8-whitegrid")
HAS_SNS = False
mpl.rcParams.update({
"figure.figsize": (10, 6),
"figure.dpi": 120,
"font.size": 13,
"axes.titlesize": 15,
"axes.labelsize": 13,
"xtick.labelsize": 11,
"ytick.labelsize": 11,
"legend.fontsize": 11,
"legend.framealpha": 0.85,
"lines.linewidth": 2.0,
"axes.spines.top": False,
"axes.spines.right": False,
"savefig.bbox": "tight",
"savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import numpy as np
import scipy.linalg as la
from scipy import stats
try:
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
import seaborn as sns
sns.set_theme(style='whitegrid', palette='colorblind')
HAS_SNS = True
except ImportError:
plt.style.use('seaborn-v0_8-whitegrid')
HAS_SNS = False
mpl.rcParams.update({
'figure.figsize': (10, 6),
'figure.dpi': 120,
'font.size': 13,
'axes.titlesize': 15,
'axes.labelsize': 13,
})
COLORS = {
'primary': '#0077BB',
'secondary': '#EE7733',
'tertiary': '#009988',
'error': '#CC3311',
'neutral': '#555555',
'highlight': '#EE3377',
}
HAS_MPL = True
except ImportError:
HAS_MPL = False
np.set_printoptions(precision=6, suppress=True)
np.random.seed(42)
print('Setup complete.')
1. Quadratic Forms and Geometry
A quadratic form associated with a symmetric matrix $A$ is $Q(x) = x^\top A x$. We visualize the level sets to see the geometry of positive definite, semidefinite, and indefinite matrices.
Code cell 5
# === 1.1 Quadratic Form Level Sets ===
def plot_quadratic_form(A, ax, title, xlim=(-3,3), ylim=(-3,3)):
x = np.linspace(xlim[0], xlim[1], 400)
y = np.linspace(ylim[0], ylim[1], 400)
X, Y = np.meshgrid(x, y)
# Q(x,y) = a*x^2 + 2b*x*y + d*y^2
a, b, d = A[0,0], A[0,1], A[1,1]
Z = a*X**2 + 2*b*X*Y + d*Y**2
if HAS_MPL:
# Contour at Q=1
ax.contour(X, Y, Z, levels=[0, 1], colors=[COLORS['neutral'], COLORS['primary']],
linewidths=[0.5, 2.5])
ax.contourf(X, Y, Z, levels=20, cmap='plasma', alpha=0.4)
ax.axhline(0, color='k', linewidth=0.5)
ax.axvline(0, color='k', linewidth=0.5)
ax.set_xlim(xlim); ax.set_ylim(ylim)
ax.set_aspect('equal')
ax.set_title(title)
ax.set_xlabel('$x_1$'); ax.set_ylabel('$x_2$')
eigs = np.linalg.eigvalsh(A)
ax.text(0.05, 0.95, f'$\\lambda$ = {eigs[0]:.2f}, {eigs[1]:.2f}',
transform=ax.transAxes, va='top', fontsize=10)
if HAS_MPL:
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
fig.suptitle('Quadratic Form Level Sets: PD, PSD, Indefinite', fontsize=14)
A_pd = np.array([[4., 1.], [1., 2.]]) # PD
A_psd = np.array([[1., 0.], [0., 0.]]) # PSD (rank 1)
A_indef = np.array([[1., 2.], [2., -1.]]) # Indefinite
plot_quadratic_form(A_pd, axes[0], 'PD: $A \\succ 0$\n(ellipse)')
plot_quadratic_form(A_psd, axes[1], 'PSD: $A \\succeq 0$\n(degenerate)')
plot_quadratic_form(A_indef, axes[2], 'Indefinite\n(hyperbola)', xlim=(-2,2), ylim=(-2,2))
fig.tight_layout()
plt.show()
print('Figure: level sets of quadratic forms')
else:
print('matplotlib not available; skipping plot')
Code cell 6
# === 1.2 Ellipse Semi-Axes from Eigenvalues ===
A = np.array([[4., 1.], [1., 2.]])
eigvals, eigvecs = np.linalg.eigh(A)
print('Matrix A:')
print(A)
print(f'\nEigenvalues: {eigvals}')
print(f'Semi-axes of ellipse Q=1: {1/np.sqrt(eigvals)}')
print(f'Axes aligned with eigenvectors:')
for i, (lam, v) in enumerate(zip(eigvals, eigvecs.T)):
print(f' lambda_{i+1}={lam:.4f}, axis direction={v}')
# The larger eigenvalue => smaller semi-axis (more compressed)
print(f'\nLarger lambda ({eigvals[1]:.2f}) => shorter semi-axis ({1/np.sqrt(eigvals[1]):.4f})')
print(f'Smaller lambda ({eigvals[0]:.2f}) => longer semi-axis ({1/np.sqrt(eigvals[0]):.4f})')
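To connect the picture to the algebra: in eigencoordinates $y = Q^\top x$ the form becomes $Q(x) = \sum_i \lambda_i y_i^2$, so each eigenvalue scales its own axis. A quick numerical check of this identity, reusing A, eigvals, and eigvecs from the cell above (the test point is an arbitrary choice):
# Supplementary check: the quadratic form diagonalizes in the eigenbasis
x_test = np.array([0.7, -1.3])            # arbitrary test point
y_test = eigvecs.T @ x_test               # coordinates in the eigenbasis
q_direct = x_test @ A @ x_test            # x^T A x
q_eigen = np.sum(eigvals * y_test**2)     # sum_i lambda_i y_i^2
print(f'x^T A x            = {q_direct:.6f}')
print(f'sum lambda_i y_i^2 = {q_eigen:.6f}')
print(f'PASS - identical: {np.isclose(q_direct, q_eigen)}')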
2. Characterizations of Positive Definiteness
We verify all four equivalent characterizations of $A \succ 0$:
- Quadratic form: $x^\top A x > 0$ for all $x \neq 0$
- Spectral: all eigenvalues $\lambda_i > 0$
- Sylvester: all leading principal minors $\Delta_k > 0$
- Cholesky: a factorization $A = LL^\top$ exists with positive diagonal
Code cell 8
# === 2.1 Four Equivalent Characterizations ===
def check_pd_quadratic(A, n_random=1000):
"""Test quadratic form positivity on random vectors."""
X = np.random.randn(n_random, A.shape[0])
qvals = np.einsum('ni,ij,nj->n', X, A, X)
return np.all(qvals > 0)
def check_pd_spectral(A):
return np.all(np.linalg.eigvalsh(A) > 0)
def check_pd_sylvester(A):
n = A.shape[0]
return all(np.linalg.det(A[:k,:k]) > 0 for k in range(1, n+1))
def check_pd_cholesky(A):
try:
np.linalg.cholesky(A)
return True
except np.linalg.LinAlgError:
return False
test_matrices = {
'PD: I': np.eye(3),
'PD: diag(1,2,3)': np.diag([1.,2.,3.]),
'PD: general': np.array([[4.,2.,0.],[2.,5.,1.],[0.,1.,3.]]),
'PSD (rank 2)': np.array([[1.,0.,0.],[0.,1.,0.],[0.,0.,0.]]),
'Indefinite': np.array([[1.,0.,0.],[0.,1.,0.],[0.,0.,-1.]]),
}
print(f"{'Matrix':<20} {'Quadratic':>10} {'Spectral':>10} {'Sylvester':>10} {'Cholesky':>10}")
print('-' * 65)
for name, A in test_matrices.items():
q = check_pd_quadratic(A)
s = check_pd_spectral(A)
sy = check_pd_sylvester(A)
ch = check_pd_cholesky(A)
print(f'{name:<20} {str(q):>10} {str(s):>10} {str(sy):>10} {str(ch):>10}')
Code cell 9
# === 2.2 Sylvester's Criterion — Finding the PD Range ===
# A(alpha) = [[1, alpha, 0],[alpha, 2, alpha],[0, alpha, 4]]
# Find alpha values for PD
alphas = np.linspace(-2, 2, 500)
min_eigvals = []
delta1_vals = []
delta2_vals = []
delta3_vals = []
for a in alphas:
A = np.array([[1., a, 0.],[a, 2., a],[0., a, 4.]])
min_eigvals.append(np.linalg.eigvalsh(A).min())
delta1_vals.append(A[0,0])
delta2_vals.append(np.linalg.det(A[:2,:2]))
delta3_vals.append(np.linalg.det(A))
min_eigvals = np.array(min_eigvals)
delta2_vals = np.array(delta2_vals)
delta3_vals = np.array(delta3_vals)
pd_region = min_eigvals > 0
print(f'PD for alpha in approx [{alphas[pd_region].min():.4f}, {alphas[pd_region].max():.4f}]')
# Analytical: Delta2 = 2 - alpha^2 > 0 => |alpha| < sqrt(2)
# Delta3 = 4*Delta2 - alpha^2 = 4*(2 - alpha^2) - alpha^2 = 8 - 5*alpha^2 > 0 => |alpha| < sqrt(8/5)
print(f'Analytical bound from Delta2: |alpha| < sqrt(2) = {np.sqrt(2):.4f}')
print(f'Analytical bound from Delta3: |alpha| < sqrt(8/5) = {np.sqrt(8/5):.4f}')
print(f'Binding constraint: |alpha| < {np.sqrt(8/5):.4f}')
if HAS_MPL:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
ax1.plot(alphas, min_eigvals, color=COLORS['primary'], label='min eigenvalue')
ax1.axhline(0, color=COLORS['neutral'], linestyle='--', linewidth=0.8)
ax1.fill_between(alphas, min_eigvals, 0,
where=(min_eigvals > 0), alpha=0.2, color=COLORS['tertiary'],
label='PD region')
ax1.set_title(r'Min eigenvalue of $A(\alpha)$')
ax1.set_xlabel(r'$\alpha$'); ax1.set_ylabel(r'$\lambda_{\min}$')
ax1.legend()
ax2.plot(alphas, delta2_vals, color=COLORS['primary'], label=r'$\Delta_2 = 2-\alpha^2$')
ax2.plot(alphas, delta3_vals, color=COLORS['secondary'], label=r'$\Delta_3$')
ax2.axhline(0, color=COLORS['neutral'], linestyle='--', linewidth=0.8)
ax2.set_title('Sylvester leading principal minors')
ax2.set_xlabel(r'$\alpha$'); ax2.set_ylabel('Minor value')
ax2.legend()
fig.tight_layout()
plt.show()
3. Cholesky Decomposition (From Scratch)
We implement the Cholesky algorithm from the formulas $L_{jj} = \sqrt{A_{jj} - \sum_{k<j} L_{jk}^2}$ and $L_{ij} = \big(A_{ij} - \sum_{k<j} L_{ik} L_{jk}\big) / L_{jj}$ for $i > j$.
Code cell 11
# === 3.1 Cholesky Algorithm (Pure NumPy) ===
def cholesky_scratch(A):
"""Compute Cholesky factor L (lower triangular, pos. diag.) with A = L @ L.T."""
A = np.array(A, dtype=float)
n = A.shape[0]
L = np.zeros((n, n))
for j in range(n):
# Diagonal entry
s = A[j, j] - np.sum(L[j, :j]**2)
if s <= 0:
raise ValueError(f'Matrix is not positive definite: pivot {s:.6e} at j={j}')
L[j, j] = np.sqrt(s)
# Sub-diagonal entries in column j
for i in range(j+1, n):
L[i, j] = (A[i, j] - np.sum(L[i, :j] * L[j, :j])) / L[j, j]
return L
# Test on the textbook example A = [[4,2,0],[2,5,1],[0,1,3]]
A_test = np.array([[4., 2., 0.], [2., 5., 1.], [0., 1., 3.]])
L_scratch = cholesky_scratch(A_test)
L_numpy = np.linalg.cholesky(A_test)
print('Cholesky factor L (our implementation):')
print(L_scratch)
print('\nCholesky factor L (NumPy):')
print(L_numpy)
# Verify
recon = L_scratch @ L_scratch.T
ok = np.allclose(recon, A_test)
match = np.allclose(L_scratch, L_numpy)
print(f'\nPASS - L @ L.T = A: {ok}')
print(f'PASS - Matches NumPy: {match}')
Code cell 12
# === 3.2 Solving A x = b via Cholesky ===
from scipy.linalg import solve_triangular
def cholesky_solve(A, b):
"""Solve Ax=b using Cholesky: A = LL^T."""
L = cholesky_scratch(A)
# Forward: L y = b
y = solve_triangular(L, b, lower=True)
# Backward: L^T x = y
x = solve_triangular(L.T, y, lower=False)
return x
A = np.array([[4., 2., 0.], [2., 5., 1.], [0., 1., 3.]])
b = np.array([1., 2., 3.])
x_chol = cholesky_solve(A, b)
x_numpy = np.linalg.solve(A, b)
print(f'Solution via Cholesky: {x_chol}')
print(f'Solution via NumPy: {x_numpy}')
print(f'Residual ||Ax-b|| = {np.linalg.norm(A @ x_chol - b):.2e}')
print(f'PASS - Cholesky solve matches: {np.allclose(x_chol, x_numpy)}')
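SciPy bundles the same factor-then-two-triangular-solves pattern as scipy.linalg.cho_factor / cho_solve; a quick cross-check against our solver, reusing A, b, and x_chol from above:
# Supplementary check: library route via cho_factor / cho_solve
c_and_lower = la.cho_factor(A)              # factor once, reuse for many right-hand sides
x_scipy = la.cho_solve(c_and_lower, b)
print(f'Solution via scipy cho_solve: {x_scipy}')
print(f'PASS - matches our Cholesky solve: {np.allclose(x_scipy, x_chol)}')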
Code cell 13
# === 3.3 Cholesky Failure = Matrix Not PD ===
# Indefinite matrix — Cholesky should fail
A_indef = np.array([[1., 3.], [3., 1.]])
print('Testing Cholesky on indefinite matrix A = [[1,3],[3,1]]:')
print(f'Eigenvalues: {np.linalg.eigvalsh(A_indef)}')
print(f'det = {np.linalg.det(A_indef):.2f}')
try:
L = cholesky_scratch(A_indef)
print('Unexpectedly succeeded!')
except ValueError as e:
print(f'PASS - Failed as expected: {e}')
# Shift the spectrum to make the matrix PD: lambda_min = -2, so add a bit more than 2*I
A_jitter = A_indef + 3.1 * np.eye(2)  # eigenvalues become 1.1 and 7.1
print(f'\nAfter adding 3.1*I: eigenvalues = {np.linalg.eigvalsh(A_jitter)}')
L_ok = cholesky_scratch(A_jitter)
print(f'PASS - Cholesky succeeds after jitter: {np.allclose(L_ok @ L_ok.T, A_jitter)}')
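A common practical heuristic (used, e.g., when a kernel or covariance matrix is PSD only up to round-off) is to retry Cholesky with geometrically growing jitter until it succeeds. A minimal sketch of that loop; the starting jitter 1e-10 and the growth factor 10 are arbitrary choices:
def cholesky_with_jitter(A, jitter0=1e-10, max_tries=10):
    """Retry Cholesky on A + jitter*I, growing jitter 10x per failed attempt."""
    jitter = 0.0
    for _ in range(max_tries):
        try:
            return np.linalg.cholesky(A + jitter * np.eye(A.shape[0])), jitter
        except np.linalg.LinAlgError:
            jitter = jitter0 if jitter == 0.0 else jitter * 10
    raise ValueError('matrix is too far from PD even after jitter')

# A matrix that is indefinite only through round-off (eigenvalues ~ 2 and -1e-12)
A_roundoff = np.array([[1., 1.], [1., 1.]]) - 1e-12 * np.eye(2)
L_j, used_jitter = cholesky_with_jitter(A_roundoff)
print(f'Cholesky succeeded with jitter = {used_jitter:.1e}')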
Code cell 14
# === 3.4 Cholesky vs LU Complexity ===
import time
sizes = [100, 200, 400, 800]
times_chol = []
times_lu = []
for n in sizes:
# Generate random PD matrix
X = np.random.randn(n, n)
A = X.T @ X + n * np.eye(n) # guaranteed PD
reps = 3
t0 = time.perf_counter()
for _ in range(reps):
L = np.linalg.cholesky(A)
t_chol = (time.perf_counter() - t0) / reps
t0 = time.perf_counter()
for _ in range(reps):
P, Llu, U = la.lu(A)
t_lu = (time.perf_counter() - t0) / reps
times_chol.append(t_chol * 1000)
times_lu.append(t_lu * 1000)
print(f'n={n:4d}: Cholesky={t_chol*1000:.2f}ms LU={t_lu*1000:.2f}ms ratio={t_lu/t_chol:.2f}x')
print('\nTheoretical: Cholesky ~ n^3/3, LU ~ 2n^3/3. Ratio -> 2x for large n.')
4. LDL^T Factorization
The LDL$^\top$ factorization decomposes $A = L D L^\top$, where $L$ is unit lower triangular and $D$ is diagonal. This avoids square roots and is preferable when $A$ may be indefinite.
Code cell 16
# === 4.1 LDL^T Algorithm ===
def ldlt_scratch(A):
"""Compute LDL^T factorization. Returns (L, d) where d=diag(D)."""
A = np.array(A, dtype=float)
n = A.shape[0]
L = np.eye(n)
d = np.zeros(n)
for j in range(n):
# v[k] = d[k] * L[j, k] for k < j
v = d[:j] * L[j, :j]
d[j] = A[j, j] - np.dot(L[j, :j], v)
if abs(d[j]) < 1e-14:
raise ValueError(f'Zero pivot at j={j}: matrix may be singular')
for i in range(j+1, n):
L[i, j] = (A[i, j] - np.dot(L[i, :j], v)) / d[j]
return L, d
A = np.array([[4., 2., 1.], [2., 5., 3.], [1., 3., 6.]])
L, d = ldlt_scratch(A)
D = np.diag(d)
print('L (unit lower triangular):')
print(L)
print('\nd (diagonal of D):', d)
recon = L @ D @ L.T
ok = np.allclose(recon, A)
print(f'\nPASS - L @ D @ L.T = A: {ok}')
print(f'Reconstruction error: {np.max(np.abs(recon - A)):.2e}')
# Compare with scipy
L_sp, D_sp, _ = la.ldl(A)
print(f'PASS - Matches scipy.linalg.ldl: {np.allclose(L, L_sp) and np.allclose(np.diag(d), D_sp)}')
Code cell 17
# === 4.2 Pivots and PD Characterization ===
def test_pd_via_ldlt(A):
try:
L, d = ldlt_scratch(A)
return np.all(d > 0), d
except Exception as e:
return False, None
matrices = {
'PD': np.array([[4.,2.],[2.,3.]]),
'PSD': np.array([[1.,0.],[0.,0.]]),
'Indef': np.array([[1.,2.],[2.,1.]]),
'ND': np.array([[-1.,0.],[0.,-2.]]),
}
print(f"{'Matrix':<10} {'All d>0':>10} {'Pivots'}")
print('-'*45)
for name, A in matrices.items():
is_pd, pivots = test_pd_via_ldlt(A)
pivot_str = str(np.round(pivots, 4)) if pivots is not None else 'N/A'
print(f'{name:<10} {str(is_pd):>10} {pivot_str}')
5. Matrix Square Root and Whitening
The symmetric PSD square root $A^{1/2}$ satisfies $A^{1/2} A^{1/2} = A$ and is the unique symmetric PSD solution. It differs from the Cholesky factor (which is lower triangular).
Code cell 19
# === 5.1 PSD Square Root via Eigendecomposition ===
def psd_sqrt(A):
"""Compute symmetric PSD square root A^{1/2}."""
eigvals, Q = np.linalg.eigh(A)
if np.any(eigvals < -1e-10):
raise ValueError(f'Matrix not PSD: min eigenvalue = {eigvals.min():.2e}')
eigvals = np.maximum(eigvals, 0) # clip small negatives to 0
return Q @ np.diag(np.sqrt(eigvals)) @ Q.T
A = np.array([[4., 2.], [2., 3.]])
A_sqrt = psd_sqrt(A)
L_chol = np.linalg.cholesky(A)
print('A:')
print(A)
print('\nPSD square root A^{1/2}:')
print(A_sqrt)
print('\nCholesky factor L (NOT the same!):')
print(L_chol)
# Verify
print(f'\nA^{{1/2}} @ A^{{1/2}} = A: {np.allclose(A_sqrt @ A_sqrt, A)}')
print(f'L @ L.T = A: {np.allclose(L_chol @ L_chol.T, A)}')
print(f'A^{{1/2}} is symmetric: {np.allclose(A_sqrt, A_sqrt.T)}')
print(f'L is symmetric: {np.allclose(L_chol, L_chol.T)}')
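As a cross-check, scipy.linalg.sqrtm (a general-purpose matrix square root) should agree with the eigendecomposition route on a symmetric PD input; any imaginary part it returns here is pure round-off, so we discard it:
# Supplementary check: compare with scipy.linalg.sqrtm
A_sqrt_scipy = np.real_if_close(la.sqrtm(A))   # drop negligible imaginary round-off, if any
print('scipy.linalg.sqrtm(A):')
print(np.round(A_sqrt_scipy, 6))
print(f'PASS - matches psd_sqrt: {np.allclose(A_sqrt_scipy, A_sqrt)}')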
Code cell 20
# === 5.2 ZCA Whitening vs Cholesky Whitening ===
np.random.seed(42)
# Generate correlated 2D data
Sigma = np.array([[4., 2.], [2., 3.]])
L_chol = np.linalg.cholesky(Sigma)
X_raw = (L_chol @ np.random.randn(2, 500)).T # 500 samples from N(0, Sigma)
# Covariance of raw data
S_raw = np.cov(X_raw.T)
print('Sample covariance (should be ~Sigma):')
print(np.round(S_raw, 3))
# ZCA whitening: W = Sigma^{-1/2}
Sigma_sqrt_inv = np.linalg.inv(psd_sqrt(Sigma))
X_zca = (Sigma_sqrt_inv @ X_raw.T).T
S_zca = np.cov(X_zca.T)
print('\nZCA whitened covariance (should be ~I):')
print(np.round(S_zca, 3))
# Cholesky whitening: W = L^{-1}
X_chol = (np.linalg.inv(L_chol) @ X_raw.T).T
S_chol = np.cov(X_chol.T)
print('\nCholesky whitened covariance (should be ~I):')
print(np.round(S_chol, 3))
print(f'\nPASS - ZCA gives identity: {np.allclose(S_zca, np.eye(2), atol=0.1)}')
print(f'PASS - Cholesky gives identity: {np.allclose(S_chol, np.eye(2), atol=0.1)}')
if HAS_MPL:
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
fig.suptitle('Whitening Transforms', fontsize=14)
for ax, X, ttl in zip(axes,
[X_raw, X_zca, X_chol],
['Raw data (correlated)', 'ZCA whitened', 'Cholesky whitened']):
ax.scatter(X[:,0], X[:,1], alpha=0.3, s=5, color=COLORS['primary'])
ax.set_aspect('equal')
ax.set_title(ttl)
ax.set_xlabel('$z_1$'); ax.set_ylabel('$z_2$')
fig.tight_layout()
plt.show()
6. Schur Complement
The Schur complement of the block $A$ in $M = \begin{bmatrix} A & B \\ B^\top & D \end{bmatrix}$ is $S = D - B^\top A^{-1} B$; it is the key to block-matrix PD testing, inversion, and Gaussian conditioning.
Code cell 22
# === 6.1 Schur Complement and Block PD ===
def schur_complement(M, p):
"""Schur complement of A=M[:p,:p] in M."""
A = M[:p, :p]
B = M[:p, p:]
C = M[p:, :p]
D = M[p:, p:]
return D - C @ np.linalg.inv(A) @ B
# Example: 3x3 block matrix
A_block = np.array([[4., 1.], [1., 3.]])
B_block = np.array([[1.], [2.]])
D_block = np.array([[5.]])
M = np.block([[A_block, B_block], [B_block.T, D_block]])
print('Block matrix M:')
print(M)
S = schur_complement(M, 2)
print(f'\nSchur complement S = D - B^T A^-1 B = {S[0,0]:.6f}')
# PD iff A succ 0 and S succ 0
A_pd = np.all(np.linalg.eigvalsh(A_block) > 0)
S_pd = S[0,0] > 0
M_pd_check = np.all(np.linalg.eigvalsh(M) > 0)
print(f'\nA succ 0: {A_pd}')
print(f'S succ 0: {S_pd}')
print(f'M succ 0 (direct check): {M_pd_check}')
print(f'PASS - Schur criterion matches direct: {(A_pd and S_pd) == M_pd_check}')
# Determinant via Schur
det_via_schur = np.linalg.det(A_block) * S[0,0]
det_direct = np.linalg.det(M)
print(f'\ndet(M) via Schur: {det_via_schur:.6f}')
print(f'det(M) direct: {det_direct:.6f}')
print(f'PASS - Match: {np.allclose(det_via_schur, det_direct)}')
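The Gaussian-conditioning connection mentioned in the intro can be checked numerically: if $(x_1, x_2)$ is jointly Gaussian with covariance $M$, then $\mathrm{Cov}(x_2 \mid x_1) = D - B^\top A^{-1} B = S$, which equals the inverse of the corresponding block of the precision matrix $M^{-1}$. A quick check reusing M and S from the cell above:
# Gaussian conditioning: conditional covariance of the second block = Schur complement
M_inv = np.linalg.inv(M)                       # precision matrix
cond_cov = np.linalg.inv(M_inv[2:, 2:])        # inverse of the (2,2) precision block
print(f'Schur complement S:     {S[0,0]:.6f}')
print(f'inv of precision block: {cond_cov[0,0]:.6f}')
print(f'PASS - conditional covariance equals Schur complement: {np.allclose(S, cond_cov)}')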
Code cell 23
# === 6.2 Woodbury Matrix Identity ===
def woodbury(A_inv, U, C_inv, V):
"""(A + U C V)^{-1} = A_inv - A_inv U (C_inv + V A_inv U)^{-1} V A_inv."""
M = C_inv + V @ A_inv @ U
return A_inv - A_inv @ U @ np.linalg.inv(M) @ V @ A_inv
# Test: A = 10*I, U = random n x k, C = I_k, V = U.T
n, k = 100, 5
A = 10 * np.eye(n)
A_inv = 0.1 * np.eye(n)
U = np.random.randn(n, k)
C = np.eye(k)
C_inv = np.eye(k)
V = U.T
# Direct: invert n x n matrix
M_direct = A + U @ C @ V # = 10I + UU^T
inv_direct = np.linalg.inv(M_direct)
# Woodbury: invert k x k matrix
inv_woodbury = woodbury(A_inv, U, C_inv, V)
err = np.max(np.abs(inv_direct - inv_woodbury))
print(f'n={n}, k={k} (low-rank update)')
print(f'Max error |inv_direct - inv_woodbury| = {err:.2e}')
print(f'PASS - Woodbury identity verified: {err < 1e-10}')
print(f'\nWoodbury advantage: invert {k}x{k} instead of {n}x{n} matrix')
7. Log-Determinant
The log-determinant, computed via the Cholesky factor as $\log\det A = 2\sum_i \log L_{ii}$, is numerically stable and essential for Gaussian log-likelihoods, normalizing flows, and GP training.
Code cell 25
# === 7.1 Log-Det via Cholesky (Stable) ===
def log_det_cholesky(A):
"""Numerically stable log-determinant via Cholesky."""
L = np.linalg.cholesky(A)
return 2.0 * np.sum(np.log(np.diag(L)))
# Test on a 5x5 PD matrix
np.random.seed(42)
X = np.random.randn(5, 5)
A = X.T @ X + 5 * np.eye(5)
ld_chol = log_det_cholesky(A)
ld_eig = np.sum(np.log(np.linalg.eigvalsh(A)))
ld_direct = np.log(np.linalg.det(A))
print(f'log det via Cholesky: {ld_chol:.8f}')
print(f'log det via eigvals: {ld_eig:.8f}')
print(f'log det via det(): {ld_direct:.8f}')
print(f'PASS - All agree: {np.allclose([ld_chol, ld_eig], ld_direct)}')
# Speed comparison for large n
import time
n = 500
B = np.random.randn(n, n)
A_big = B.T @ B + n * np.eye(n)
reps = 5
t0 = time.perf_counter()
for _ in range(reps): ld_chol_big = log_det_cholesky(A_big)
t_chol = (time.perf_counter()-t0)/reps
t0 = time.perf_counter()
# np.linalg.det(A_big) overflows to inf here (product of 500 eigenvalues, each > 500),
# so use the sign/log-magnitude form np.linalg.slogdet for the comparison
for _ in range(reps): _, ld_slogdet_big = np.linalg.slogdet(A_big)
t_det = (time.perf_counter()-t0)/reps
print(f'\nn={n}: Cholesky={t_chol*1000:.2f}ms slogdet()={t_det*1000:.2f}ms')
print(f'PASS - Results agree: {np.allclose(ld_chol_big, ld_slogdet_big, rtol=1e-5)}')
Code cell 26
# === 7.2 Log-Det Gradient Numerical Verification ===
# Verify: d/dA log det A = A^{-1} using finite differences
A = np.array([[4., 2., 1.], [2., 5., 3.], [1., 3., 6.]])
A_inv = np.linalg.inv(A)
h = 1e-5
fd_grad = np.zeros_like(A)
for i in range(A.shape[0]):
for j in range(A.shape[1]):
E = np.zeros_like(A)
E[i, j] = h
A_plus = A + E
A_minus = A - E
# Only compute for valid (PD) perturbations
if np.all(np.linalg.eigvalsh(A_plus) > 0):
fd_grad[i, j] = (log_det_cholesky(A_plus) - log_det_cholesky(A_minus)) / (2*h)
print('Analytical gradient A^{-1}:')
print(np.round(A_inv, 6))
print('\nFinite difference gradient:')
print(np.round(fd_grad, 6))
err = np.max(np.abs(A_inv - fd_grad))
print(f'\nMax error: {err:.2e}')
print(f'PASS - Gradient matches A^{{-1}}: {err < 1e-4}')
Code cell 27
# === 7.3 Gaussian Log-Likelihood ===
np.random.seed(42)
# Ground truth covariance
Sigma_true = np.array([[4., 2.], [2., 3.]])
L_true = np.linalg.cholesky(Sigma_true)
mu_true = np.array([1., 0.])
# Generate 100 samples
n_samples = 100
eps = np.random.randn(2, n_samples)
X_data = (mu_true[:, None] + L_true @ eps).T # shape (n, 2)
def gaussian_log_lik(X, mu, Sigma):
"""Multivariate Gaussian log-likelihood."""
n, d = X.shape
L = np.linalg.cholesky(Sigma)
log_det_S = 2.0 * np.sum(np.log(np.diag(L)))
diff = X - mu
# mahal = sum_i (x_i - mu)^T Sigma^{-1} (x_i - mu)
Z = la.solve_triangular(L, diff.T, lower=True)
mahal = np.sum(Z**2)
return -0.5 * (n * d * np.log(2*np.pi) + n * log_det_S + mahal)
ll_true = gaussian_log_lik(X_data, mu_true, Sigma_true)
ll_identity = gaussian_log_lik(X_data, mu_true, np.eye(2))
print(f'Log-likelihood at true Sigma: {ll_true:.4f}')
print(f'Log-likelihood at I (wrong Sigma): {ll_identity:.4f}')
print(f'True Sigma is better: {ll_true > ll_identity}')
# MLE covariance estimate
mu_hat = X_data.mean(axis=0)
Sigma_hat = np.cov(X_data.T) * (n_samples - 1) / n_samples # MLE (biased)
ll_mle = gaussian_log_lik(X_data, mu_hat, Sigma_hat)
print(f'Log-likelihood at MLE Sigma: {ll_mle:.4f} (MLE >= true by design)')
8. Gram Matrices and Kernel Matrices
Every Gram matrix $G = X X^\top$ is PSD. We explore this and build kernel matrices for the RBF, polynomial, and linear kernels.
Code cell 29
# === 8.1 Gram Matrix Properties ===
np.random.seed(42)
# Full rank case: n < d
X_full = np.random.randn(5, 10) # 5 samples, 10 features
G_full = X_full @ X_full.T
print('X shape:', X_full.shape, '(n < d)')
print('G = X X^T, shape:', G_full.shape)
eigs = np.linalg.eigvalsh(G_full)
print(f'Eigenvalues of G: {np.round(eigs, 4)}')
print(f'rank(G) = {np.linalg.matrix_rank(G_full)} (= rank(X) = {np.linalg.matrix_rank(X_full)})')
print(f'G is PSD: {np.all(eigs >= -1e-10)}')
print(f'G is PD: {np.all(eigs > 0)} (True here: rank(X) = n = 5, so the 5x5 Gram matrix is full rank)')
# Overdetermined: n > d
X_over = np.random.randn(10, 5) # 10 samples, 5 features
G_over = X_over @ X_over.T
eigs_over = np.linalg.eigvalsh(G_over)
print(f'\nX shape: {X_over.shape} (n > d)')
print(f'rank(G) = {np.linalg.matrix_rank(G_over)} <= d={5}')
print(f'Smallest eigenvalue: {eigs_over.min():.4f} (near 0 for rank-5 matrix in 10D)')
Code cell 30
# === 8.2 Kernel Matrices ===
def rbf_kernel(X, Y=None, length_scale=1.0):
if Y is None: Y = X
diffs = X[:, None, :] - Y[None, :, :] # (n, m, d)
sq_dists = np.sum(diffs**2, axis=-1)
return np.exp(-sq_dists / (2 * length_scale**2))
def linear_kernel(X, Y=None):
if Y is None: Y = X
return X @ Y.T
def poly_kernel(X, Y=None, degree=2, c=1.0):
if Y is None: Y = X
return (X @ Y.T + c)**degree
np.random.seed(7)
X = np.random.randn(6, 3) # 6 points in R^3
kernels = {
'RBF (l=1)': rbf_kernel(X),
'Linear': linear_kernel(X),
'Poly (d=2)': poly_kernel(X),
}
print(f"{'Kernel':<15} {'Min eigval':>12} {'Is PSD':>8} {'Is PD':>8}")
print('-'*48)
for name, K in kernels.items():
eigs = np.linalg.eigvalsh(K)
is_psd = np.all(eigs >= -1e-10)
is_pd = np.all(eigs > 1e-10)
print(f'{name:<15} {eigs.min():>12.6f} {str(is_psd):>8} {str(is_pd):>8}')
Code cell 31
# === 8.3 Attention as Gram Matrix ===
np.random.seed(42)
# Simulate scaled dot-product attention scores
n_tokens, d_k = 8, 16
Q = np.random.randn(n_tokens, d_k)
K = np.random.randn(n_tokens, d_k)
# Score matrix
S = Q @ K.T / np.sqrt(d_k)
# Softmax attention weights
def softmax_rows(X):
X_shifted = X - X.max(axis=-1, keepdims=True)
exp_X = np.exp(X_shifted)
return exp_X / exp_X.sum(axis=-1, keepdims=True)
A_weights = softmax_rows(S)
print(f'Q, K shapes: {Q.shape}, {K.shape}')
print(f'Score matrix S = QK^T/sqrt(d_k), shape: {S.shape}')
print(f'Score range: [{S.min():.2f}, {S.max():.2f}]')
print(f'Attention weights (row-stochastic):')
print(np.round(A_weights[:3, :3], 3), '...')
print(f'Row sums = 1: {np.allclose(A_weights.sum(axis=1), 1)}')
# Self-attention (Q=K): score matrix is symmetric -> Gram matrix
S_self = Q @ Q.T / np.sqrt(d_k)
print(f'\nSelf-attention (Q=K): S is symmetric: {np.allclose(S_self, S_self.T)}')
eigs_self = np.linalg.eigvalsh(S_self)
print(f'S_self eigenvalues min={eigs_self.min():.4f} (nonnegative: scaled Gram matrix)')
print('Note: QQ^T/sqrt(d_k) is a scaled Gram matrix, hence PSD; the general score matrix QK^T need not even be symmetric, so definiteness does not apply')
9. The PSD Cone and Semidefinite Programming
The set of PSD matrices is a convex cone. SDP optimizes a linear objective over this cone intersected with an affine constraint set.
Code cell 33
# === 9.1 Convexity of the PSD Cone ===
# Verify: convex combination of PSD matrices is PSD
np.random.seed(42)
def random_psd(n, rank=None):
if rank is None: rank = n
X = np.random.randn(n, rank)
return X @ X.T + 0.1 * np.eye(n) # PD
A = random_psd(4)
B = random_psd(4)
lambdas = [0.0, 0.25, 0.5, 0.75, 1.0]
print('Verifying convexity of PSD cone: lambda*A + (1-lambda)*B is PSD for all lambda in [0,1]')
print(f"{'lambda':>8} {'min eigval':>12} {'is PSD':>8}")
for lam in lambdas:
C = lam * A + (1 - lam) * B
min_eig = np.linalg.eigvalsh(C).min()
is_psd = min_eig >= -1e-10
print(f'{lam:>8.2f} {min_eig:>12.6f} {str(is_psd):>8}')
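Because the cone is convex, the Frobenius-norm projection onto it is well defined; for a symmetric matrix it amounts to clipping negative eigenvalues to zero. A small sketch of this standard projection:
def project_psd(S):
    """Frobenius-norm projection of a symmetric matrix onto the PSD cone."""
    S = (S + S.T) / 2                              # symmetrize against round-off
    w, Q = np.linalg.eigh(S)
    return Q @ np.diag(np.maximum(w, 0)) @ Q.T     # clip negative eigenvalues

S_indef = np.array([[1., 2.], [2., 1.]])           # eigenvalues 3 and -1
S_proj = project_psd(S_indef)
print('Indefinite eigenvalues:', np.linalg.eigvalsh(S_indef))
print('Projected eigenvalues: ', np.round(np.linalg.eigvalsh(S_proj), 6))
print(f'Projection distance ||S - proj||_F = {np.linalg.norm(S_indef - S_proj):.4f}')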
Code cell 34
# === 9.2 SDP Example: Minimum Trace PD Matrix ===
# Minimize trace(X) subject to X PSD and fixed diagonal X_11=2, X_22=3, X_33=1.
# With the whole diagonal fixed, the trace is fixed at 6; any PSD completion is optimal.
try:
import cvxpy as cp
HAS_CVXPY = True
except ImportError:
HAS_CVXPY = False
if HAS_CVXPY:
n = 3
X = cp.Variable((n, n), symmetric=True)
constraints = [
X >> 0, # X PSD
X[0, 0] == 2.0, # diagonal constraints
X[1, 1] == 3.0,
X[2, 2] == 1.0,
]
prob = cp.Problem(cp.Minimize(cp.trace(X)), constraints)
prob.solve(solver=cp.SCS)
print('SDP solution (minimum trace PSD matrix with fixed diagonal):')
print(np.round(X.value, 4))
print(f'Trace = {np.trace(X.value):.4f} (expected 2+3+1=6)')
print(f'Min eigenvalue: {np.linalg.eigvalsh(X.value).min():.4f} (should be >= 0)')
    print('Solution: the diagonal (and hence the trace) is fixed by the constraints; the solver returns a PSD completion with off-diagonals ~ 0')
else:
print('cvxpy not installed. Install with: pip install cvxpy')
print('Manual solution for min-trace with fixed diagonal: off-diagonals = 0.')
X_manual = np.diag([2., 3., 1.])
print('X =', X_manual)
print(f'Trace = {np.trace(X_manual)}')
10. Applications in Machine Learning
We demonstrate five core ML applications of positive definite matrices: multivariate Gaussians, Fisher information, Gaussian process regression, Hessian analysis, and the VAE reparameterization trick.
Code cell 36
# === 10.1 Multivariate Gaussian via Cholesky ===
np.random.seed(42)
def sample_mvn(mu, Sigma, n_samples):
"""Sample from N(mu, Sigma) using Cholesky reparameterization."""
L = np.linalg.cholesky(Sigma)
d = len(mu)
eps = np.random.randn(d, n_samples) # isotropic noise
return mu[:, None] + L @ eps # correlated samples
# 2D Gaussian
mu = np.array([1., 2.])
Sigma = np.array([[4., 2.], [2., 3.]])
samples = sample_mvn(mu, Sigma, 2000).T # shape (2000, 2)
mu_hat = samples.mean(axis=0)
Sigma_hat = np.cov(samples.T)
print(f'True mu: {mu}')
print(f'Sample mu: {mu_hat.round(3)}')
print(f'\nTrue Sigma:\n{Sigma}')
print(f'Sample Sigma:\n{Sigma_hat.round(3)}')
print(f'\nPASS - Sample mean close: {np.allclose(mu_hat, mu, atol=0.15)}')
print(f'PASS - Sample cov close: {np.allclose(Sigma_hat, Sigma, atol=0.3)}')
if HAS_MPL:
fig, ax = plt.subplots(figsize=(7, 6))
ax.scatter(samples[:, 0], samples[:, 1], alpha=0.15, s=5, color=COLORS['primary'])
# Plot ellipse (1-sigma contour)
theta = np.linspace(0, 2*np.pi, 200)
eigvals, Q = np.linalg.eigh(Sigma)
circle = np.array([np.cos(theta), np.sin(theta)])
ellipse = Q @ np.diag(np.sqrt(eigvals)) @ circle
ax.plot(mu[0] + ellipse[0], mu[1] + ellipse[1],
color=COLORS['error'], linewidth=2.5, label='1-sigma ellipse')
ax.scatter(*mu, color=COLORS['highlight'], s=100, zorder=5, label='mean')
ax.set_title(r'Samples from $\mathcal{N}(\mu, \Sigma)$ via Cholesky')
ax.set_xlabel('$x_1$'); ax.set_ylabel('$x_2$')
ax.legend(); ax.set_aspect('equal')
fig.tight_layout(); plt.show()
Code cell 37
# === 10.2 Fisher Information Matrix for Gaussian ===
# For N(mu, sigma^2 * I) parameterized by (mu, log sigma_i), the FIM is block diagonal:
# F_mu = (1/sigma^2) * I_d   (verified empirically below)
# F_logsigma = 2 * I_d       (per-dimension log-scale parameters; not verified here)
def fisher_gaussian_empirical(mu, sigma, n_samples=5000):
"""Empirical Fisher: E[score * score^T]."""
d = len(mu)
# Params: [mu, log_sigma]
X = np.random.randn(n_samples, d) * sigma + mu
scores = (X - mu) / sigma**2 # d/d_mu log p = (x-mu)/sigma^2
F_emp = scores.T @ scores / n_samples
F_theory = np.eye(d) / sigma**2
return F_emp, F_theory
np.random.seed(42)
d = 3
mu = np.zeros(d)
sigma = 2.0
F_emp, F_theory = fisher_gaussian_empirical(mu, sigma)
print(f'Theoretical FIM (1/sigma^2 * I, sigma={sigma}):')
print(np.round(F_theory, 4))
print(f'\nEmpirical FIM:')
print(np.round(F_emp, 4))
print(f'\nFIM is PSD: {np.all(np.linalg.eigvalsh(F_emp) >= -1e-10)}')
print(f'PASS - Empirical matches theoretical: {np.allclose(F_emp, F_theory, atol=0.05)}')
Code cell 38
# === 10.3 Gaussian Process Regression ===
np.random.seed(42)
# Training data
X_train = np.array([0., 1., 2., 3., 4.])
y_train = np.sin(X_train) + 0.1 * np.random.randn(len(X_train))
X_test = np.linspace(-0.5, 4.5, 100)
# RBF kernel
sigma_noise = 0.1
def rbf_1d(x1, x2, length_scale=1.0):
return np.exp(-(x1[:, None] - x2[None, :])**2 / (2 * length_scale**2))
K_nn = rbf_1d(X_train, X_train)
K_sn = rbf_1d(X_test, X_train)
K_ss = rbf_1d(X_test, X_test)
# Cholesky solve
K_noisy = K_nn + sigma_noise**2 * np.eye(len(X_train))
L = np.linalg.cholesky(K_noisy)
alpha = la.cho_solve((L, True), y_train)
# Predictive mean and variance
mu_pred = K_sn @ alpha
V = la.solve_triangular(L, K_sn.T, lower=True)
var_pred = np.diag(K_ss) - np.sum(V**2, axis=0)
std_pred = np.sqrt(np.maximum(var_pred, 0))
# Log marginal likelihood
log_ml = -0.5 * y_train @ alpha - np.sum(np.log(np.diag(L))) - 0.5 * len(y_train) * np.log(2*np.pi)
print(f'GP Regression: {len(X_train)} training points')
print(f'Log marginal likelihood: {log_ml:.4f}')
print(f'Predictive variances > 0: {np.all(var_pred >= -1e-10)}')
if HAS_MPL:
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(X_test, np.sin(X_test), '--', color=COLORS['neutral'],
linewidth=1.5, label='True function')
ax.plot(X_test, mu_pred, color=COLORS['primary'],
linewidth=2.5, label='GP mean')
ax.fill_between(X_test, mu_pred - 2*std_pred, mu_pred + 2*std_pred,
alpha=0.25, color=COLORS['primary'], label=r'$\pm 2\sigma$')
ax.scatter(X_train, y_train, color=COLORS['error'],
s=80, zorder=5, label='Training data')
ax.set_title('Gaussian Process Regression (RBF kernel)')
ax.set_xlabel('x'); ax.set_ylabel('f(x)')
ax.legend(loc='upper right')
fig.tight_layout(); plt.show()
Code cell 39
# === 10.4 Hessian Sharpness Visualization ===
# Compare 'flat' vs 'sharp' quadratic loss functions
# L(theta) = 0.5 * theta^T H theta (centered at 0)
# Sharp minimum: large eigenvalues
H_sharp = np.array([[100., 50.], [50., 80.]])
# Flat minimum: small eigenvalues
H_flat = np.array([[1., 0.5], [0.5, 0.8]])
eigs_sharp = np.linalg.eigvalsh(H_sharp)
eigs_flat = np.linalg.eigvalsh(H_flat)
print(f'Sharp Hessian eigenvalues: {eigs_sharp}')
print(f'Sharp sharpness lambda_max: {eigs_sharp.max():.1f}')
print(f'\nFlat Hessian eigenvalues: {eigs_flat}')
print(f'Flat sharpness lambda_max: {eigs_flat.max():.3f}')
# SAM perturbation
rho = 0.05 # perturbation radius
grad = np.array([1., 1.]) / np.sqrt(2)
delta_star = rho * grad / np.linalg.norm(grad)
def loss_sharp(theta): return 0.5 * theta @ H_sharp @ theta
def loss_flat(theta): return 0.5 * theta @ H_flat @ theta
theta = np.array([0.1, 0.1])
print(f'\nAt theta={theta}:')
print(f' Loss (sharp): {loss_sharp(theta):.4f} -> perturbed: {loss_sharp(theta + delta_star):.4f}')
print(f' Loss (flat): {loss_flat(theta):.4f} -> perturbed: {loss_flat(theta + delta_star):.4f}')
if HAS_MPL:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle('Sharp vs Flat Loss Landscapes', fontsize=14)
t = np.linspace(-0.5, 0.5, 200)
T1, T2 = np.meshgrid(t, t)
for ax, H, ttl in [(ax1, H_sharp, 'Sharp ($\\lambda_{max}$=' + f'{eigs_sharp.max():.0f}' + ')'),
(ax2, H_flat, 'Flat ($\\lambda_{max}$=' + f'{eigs_flat.max():.2f}' + ')')]:
Z = 0.5 * (T1**2 * H[0,0] + 2*T1*T2*H[0,1] + T2**2 * H[1,1])
c = ax.contourf(T1, T2, Z, levels=20, cmap='plasma')
ax.contour(T1, T2, Z, levels=10, colors='white', alpha=0.4, linewidths=0.7)
plt.colorbar(c, ax=ax)
ax.scatter(0, 0, color='white', s=100, zorder=5)
ax.set_title(ttl)
ax.set_xlabel(r'$\theta_1$'); ax.set_ylabel(r'$\theta_2$')
ax.set_aspect('equal')
fig.tight_layout(); plt.show()
Code cell 40
# === 10.5 VAE Reparameterization Trick ===
np.random.seed(42)
def vae_kl_diagonal(mu, log_var):
"""KL(N(mu, diag(exp(log_var))) || N(0,I))."""
return 0.5 * np.sum(np.exp(log_var) + mu**2 - 1 - log_var)
def vae_kl_full(mu, L):
"""KL(N(mu, LL^T) || N(0,I)) = 0.5 * (tr(LL^T) + mu^T mu - d - log det LL^T)."""
d = len(mu)
Sigma = L @ L.T
tr_Sigma = np.trace(Sigma)
mu_sq = mu @ mu
log_det = 2.0 * np.sum(np.log(np.abs(np.diag(L))))
return 0.5 * (tr_Sigma + mu_sq - d - log_det)
# Diagonal case
d = 4
mu = np.array([0.5, -0.3, 0.8, -0.1])
s = np.array([-0.2, 0.1, -0.5, 0.3]) # log_var
kl_diag = vae_kl_diagonal(mu, s)
print(f'KL (diagonal covariance): {kl_diag:.4f}')
print(f'(Should be > 0 unless mu=0, sigma=1)')
# Full covariance case
L_vae = np.array([[2., 0., 0., 0.],
[0.5, 1.5, 0., 0.],
[0.2, 0.3, 1., 0.],
[0.1, 0.4, 0.2, 0.8]])
kl_full = vae_kl_full(mu, L_vae)
print(f'\nKL (full Cholesky covariance): {kl_full:.4f}')
# Reparameterization
n_samples = 1000
eps = np.random.randn(d, n_samples)
z_samples = mu[:, None] + L_vae @ eps # shape (d, n_samples)
print(f'\nSample mean: {z_samples.mean(axis=1).round(3)}')
print(f'Expected: {mu}')
print(f'PASS - Reparameterization sampling: {np.allclose(z_samples.mean(axis=1), mu, atol=0.1)}')
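As a sanity check on the closed-form KL, we can estimate it by Monte Carlo as $\mathbb{E}_q[\log q(z) - \log p(z)]$ using scipy.stats; the sample count of 20000 is an arbitrary choice, so the agreement tolerance is loose:
# Monte Carlo estimate of KL(q || p) with q = N(mu, L L^T) and p = N(0, I)
Sigma_vae = L_vae @ L_vae.T
q_dist = stats.multivariate_normal(mean=mu, cov=Sigma_vae)
p_dist = stats.multivariate_normal(mean=np.zeros(d), cov=np.eye(d))
z = q_dist.rvs(size=20000, random_state=0)
kl_mc = np.mean(q_dist.logpdf(z) - p_dist.logpdf(z))
print(f'KL closed form: {kl_full:.4f}')
print(f'KL Monte Carlo: {kl_mc:.4f}')
print(f'PASS - estimates agree: {np.isclose(kl_full, kl_mc, rtol=0.1)}')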
11. Condition Number and Numerical Stability
The condition number $\kappa(A) = \lambda_{\max}/\lambda_{\min}$ measures how sensitive the Cholesky solve is to perturbations. Tikhonov regularization ($A + \lambda I$) improves conditioning.
Code cell 42
# === 11.1 Condition Number and Solve Accuracy ===
def test_condition(n, kappa_target):
"""Build PD matrix with given condition number and test Cholesky solve accuracy."""
np.random.seed(42)
# Eigenvalues: 1 to kappa_target
eigvals = np.exp(np.linspace(0, np.log(kappa_target), n))
Q, _ = np.linalg.qr(np.random.randn(n, n))
A = Q @ np.diag(eigvals) @ Q.T
x_true = np.random.randn(n)
b = A @ x_true
x_chol = np.linalg.solve(A, b)
rel_err = np.linalg.norm(x_chol - x_true) / np.linalg.norm(x_true)
kappa_actual = np.linalg.cond(A)
return kappa_actual, rel_err
print(f"{'Cond. number':>15} {'Relative error':>16} {'Digits lost':>12}")
print('-' * 50)
import math
for kappa in [1e2, 1e4, 1e6, 1e8, 1e10, 1e12]:
k_actual, err = test_condition(20, kappa)
digits = max(0, -math.log10(max(err, 1e-16)))
print(f'{k_actual:>15.2e} {err:>16.2e} {digits:>12.1f}')
print('\nRule: ~log10(kappa) digits of precision lost')
Code cell 43
# === 11.2 Tikhonov Regularization Improves Conditioning ===
np.random.seed(42)
n = 20
# Ill-conditioned matrix
eigvals = np.exp(np.linspace(0, 12, n)) # kappa ~ e^12 ~ 1.6e5
Q, _ = np.linalg.qr(np.random.randn(n, n))
A = Q @ np.diag(eigvals) @ Q.T
lambdas = [0, 1e-4, 1e-2, 1e0, 1e2]
print(f'Original condition number: {np.linalg.cond(A):.2e}')
print(f"\n{'lambda':>10} {'kappa(A+lI)':>14} {'kappa ratio':>12}")
print('-' * 40)
kappa0 = np.linalg.cond(A)
for lam in lambdas:
A_reg = A + lam * np.eye(n)
kappa = np.linalg.cond(A_reg)
print(f'{lam:>10.4f} {kappa:>14.2e} {kappa/kappa0:>12.4f}')
print('\nLarger lambda -> smaller condition number (better stability, more bias)')
12. Closure Properties and Algebraic Structure
We verify the algebraic closure properties of PSD matrices: which operations preserve PSD structure.
Code cell 45
# === 12.1 Schur (Hadamard) Product Theorem ===
# Element-wise product of PSD matrices is PSD
np.random.seed(42)
def random_psd(n):
X = np.random.randn(n, n)
return X.T @ X + np.eye(n)
n = 5
A = random_psd(n)
B = random_psd(n)
# Hadamard product
C_hadamard = A * B # element-wise
eigs_A = np.linalg.eigvalsh(A)
eigs_B = np.linalg.eigvalsh(B)
eigs_C = np.linalg.eigvalsh(C_hadamard)
print(f'A PSD: {np.all(eigs_A > 0)}')
print(f'B PSD: {np.all(eigs_B > 0)}')
print(f'A * B (Hadamard) PSD: {np.all(eigs_C >= -1e-10)}')
print(f'Min eigenvalue of A*B: {eigs_C.min():.4f}')
# Kronecker product
C_kron = np.kron(A[:3,:3], B[:3,:3])
eigs_kron = np.linalg.eigvalsh(C_kron)
print(f'\nA⊗B (Kronecker) PSD: {np.all(eigs_kron >= -1e-10)}')
# Sum
C_sum = A + B
eigs_sum = np.linalg.eigvalsh(C_sum)
print(f'A+B PSD: {np.all(eigs_sum > 0)}')
# Product (NOT necessarily PSD)
C_prod = A @ B
is_sym_prod = np.allclose(C_prod, C_prod.T)
print(f'A@B symmetric: {is_sym_prod} (product of PSD matrices may not be symmetric!)')
Code cell 46
# === 12.2 Congruence Preserves PD ===
# If A succ 0 and B is invertible, then B^T A B succ 0
np.random.seed(42)
A = np.array([[4., 2.], [2., 3.]])
B = np.array([[2., 1.], [3., 4.]]) # invertible
C = B.T @ A @ B
print('A (PD):')
print(A)
print(f'Eigenvalues: {np.linalg.eigvalsh(A)}')
print('\nB (invertible):')
print(B)
print(f'det(B) = {np.linalg.det(B):.2f}')
print('\nC = B^T A B (congruence):')
print(np.round(C, 4))
print(f'Eigenvalues: {np.linalg.eigvalsh(C)}')
print(f'\nPASS - Congruence preserves PD: {np.all(np.linalg.eigvalsh(C) > 0)}')
# Sylvester's law of inertia: signature is preserved under congruence
A_indef = np.array([[2., 3.], [3., 1.]]) # indefinite: det < 0
C_indef = B.T @ A_indef @ B
print(f'\nA_indef signature (pos/neg/zero eigvals): ({sum(e>0 for e in np.linalg.eigvalsh(A_indef))}, {sum(e<0 for e in np.linalg.eigvalsh(A_indef))}, 0)')
print(f'C_indef signature: ({sum(e>0 for e in np.linalg.eigvalsh(C_indef))}, {sum(e<0 for e in np.linalg.eigvalsh(C_indef))}, 0)')
print("PASS - Sylvester's law: signature preserved under congruence")
13. Covariance Estimation
In high dimensions, the sample covariance may be ill-conditioned or singular. We compare sample, Ledoit-Wolf, and diagonal estimators.
Code cell 48
# === 13.1 Sample Covariance vs Regularized Estimators ===
np.random.seed(42)
# Small n, large p regime: n=50, p=40
n_obs, p = 50, 40
Sigma_true = np.eye(p)
# Make it correlated: Sigma = I + 0.5 * 11^T / p
v = np.ones(p) / np.sqrt(p)
Sigma_true = np.eye(p) + 0.5 * np.outer(v, v)
X = np.random.multivariate_normal(np.zeros(p), Sigma_true, size=n_obs)
# Sample covariance
S_sample = np.cov(X.T)
# Ledoit-Wolf shrinkage: (1-alpha)*S + alpha*mu*I
mu_lw = np.trace(S_sample) / p
alpha_lw = 0.3 # simplified; optimal is computed analytically
S_lw = (1-alpha_lw)*S_sample + alpha_lw*mu_lw*np.eye(p)
# Diagonal estimator
S_diag = np.diag(np.diag(S_sample))
def frobenius_error(S_hat, S_true):
return np.linalg.norm(S_hat - S_true, 'fro') / np.linalg.norm(S_true, 'fro')
print(f'n={n_obs}, p={p}')
print(f'Condition number of sample covariance: {np.linalg.cond(S_sample):.2e}')
print(f'Condition number of LW estimate: {np.linalg.cond(S_lw):.2e}')
print(f'Condition number of diagonal estimate: {np.linalg.cond(S_diag):.2e}')
print(f'\nFrobenius error (sample): {frobenius_error(S_sample, Sigma_true):.4f}')
print(f'Frobenius error (LW): {frobenius_error(S_lw, Sigma_true):.4f}')
print(f'Frobenius error (diag): {frobenius_error(S_diag, Sigma_true):.4f}')
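The shrinkage intensity above is a hand-picked 0.3; scikit-learn's LedoitWolf estimator computes the analytically optimal value. A sketch assuming scikit-learn is available (guarded so the notebook still runs without it):
try:
    from sklearn.covariance import LedoitWolf
    lw = LedoitWolf().fit(X)   # X is the (n_obs, p) sample matrix from above
    print(f'Ledoit-Wolf optimal shrinkage intensity: {lw.shrinkage_:.4f}')
    print(f'Frobenius error (sklearn LW): {frobenius_error(lw.covariance_, Sigma_true):.4f}')
except ImportError:
    print('scikit-learn not installed; skipping analytic Ledoit-Wolf comparison')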
Code cell 49
# === 13.2 Cholesky Rank-1 Update ===
# When A -> A + v v^T, update L in O(n^2) instead of O(n^3)
def cholesky_rank1_update(L, v):
"""Update Cholesky factor L such that L_new L_new^T = L L^T + v v^T.
Uses Givens rotations. Returns updated L."""
L = L.copy()
n = len(v)
    # Standard algorithm: process column by column with Givens-like rotations
    x = v.copy()
for k in range(n):
r = np.sqrt(L[k,k]**2 + x[k]**2)
c = r / L[k,k]
s = x[k] / L[k,k]
L[k, k] = r
if k < n-1:
L[k+1:, k] = (L[k+1:, k] + s * x[k+1:]) / c
x[k+1:] = c * x[k+1:] - s * L[k+1:, k]
return L
np.random.seed(42)
n = 6
A = np.random.randn(n, n)
A = A.T @ A + np.eye(n)
v = np.random.randn(n)
L_orig = np.linalg.cholesky(A)
A_updated = A + np.outer(v, v)
L_full = np.linalg.cholesky(A_updated) # full recomputation
L_upd = cholesky_rank1_update(L_orig, v) # rank-1 update
err_upd = np.max(np.abs(L_upd @ L_upd.T - A_updated))
err_full = np.max(np.abs(L_full @ L_full.T - A_updated))
print(f'Rank-1 update error: {err_upd:.2e}')
print(f'Full refactor error: {err_full:.2e}')
print(f'PASS - Rank-1 update matches full recomputation: {err_upd < 1e-10}')
Code cell 50
# === 13.3 SPD Matrix Geodesic (Log-Euclidean) ===
def matrix_log(A):
eigvals, Q = np.linalg.eigh(A)
return Q @ np.diag(np.log(eigvals)) @ Q.T
def matrix_exp(S):
eigvals, Q = np.linalg.eigh(S)
return Q @ np.diag(np.exp(eigvals)) @ Q.T
# Interpolate between two SPD matrices
A1 = np.array([[4., 1.], [1., 2.]])
A2 = np.array([[1., 0.], [0., 4.]])
print('A1 eigenvalues:', np.linalg.eigvalsh(A1))
print('A2 eigenvalues:', np.linalg.eigvalsh(A2))
# Log-Euclidean geodesic: exp((1-t)*log(A1) + t*log(A2))
ts = [0.0, 0.25, 0.5, 0.75, 1.0]
print('\nInterpolation along SPD geodesic:')
print(f"{'t':>5} {'min_eig':>10} {'trace':>8} {'is_PD':>8}")
for t in ts:
A_t = matrix_exp((1-t) * matrix_log(A1) + t * matrix_log(A2))
eigs = np.linalg.eigvalsh(A_t)
print(f'{t:>5.2f} {eigs.min():>10.4f} {np.trace(A_t):>8.4f} {str(np.all(eigs>0)):>8}')
print('\nGeodesic stays within the PSD cone (no negative eigenvalues).')
Summary
This notebook demonstrated:
| Topic | Key Result |
|---|---|
| Quadratic forms | Level sets are ellipsoids (PD), lines (PSD), or hyperbolas (indefinite) |
| Four characterizations | Quadratic form, spectral, Sylvester, Cholesky are all equivalent |
| Cholesky algorithm | $\sim n^3/3$ flops, twice as fast as LU, stable without pivoting |
| LDL$^\top$ | Avoids square roots; pivots $d_i > 0$ iff $A \succ 0$ |
| PSD square root | Unique symmetric PSD solution via eigendecomposition |
| Whitening | ZCA and Cholesky whitening both map correlated data to identity covariance |
| Schur complement | $M \succ 0 \iff A \succ 0$ and $S = D - B^\top A^{-1} B \succ 0$; Woodbury for rank-$k$ updates |
| Log-det | Via Cholesky: $\log\det A = 2\sum_i \log L_{ii}$; gradient is $A^{-1}$ |
| Gram matrices | $X X^\top \succeq 0$; PD kernels produce PSD Gram matrices |
| ML applications | Gaussian sampling, FIM, GP regression, loss landscape, VAE KL |
Next: Exercises notebook for hands-on practice.