Math for LLMs: Theory Notebook

Advanced Linear Algebra

Converted from theory.ipynb for web reading.

Matrix Decompositions

"The purpose of computing is insight, not numbers — and no insight is more powerful than factoring a matrix into pieces whose structure you understand."

This notebook implements and explores all key algorithms from §08: LU with pivoting, Householder QR, Givens rotations, rank-revealing QR, blocked algorithms, GP inference via Cholesky, and randomized methods.

Run cells top-to-bottom for best results.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

import numpy as np
import scipy.linalg as la
from scipy import stats

try:
    import matplotlib.pyplot as plt
    import matplotlib as mpl
    try:
        import seaborn as sns
        sns.set_theme(style='whitegrid', palette='colorblind')
        HAS_SNS = True
    except ImportError:
        plt.style.use('seaborn-v0_8-whitegrid')
        HAS_SNS = False
    mpl.rcParams.update({
        'figure.figsize': (10, 6),
        'figure.dpi': 120,
        'font.size': 13,
        'axes.titlesize': 15,
        'axes.labelsize': 13,
        'lines.linewidth': 2.0,
        'axes.spines.top': False,
        'axes.spines.right': False,
    })
    HAS_MPL = True
except ImportError:
    HAS_MPL = False

COLORS = {
    'primary':   '#0077BB',
    'secondary': '#EE7733',
    'tertiary':  '#009988',
    'error':     '#CC3311',
    'neutral':   '#555555',
    'highlight': '#EE3377',
}

np.set_printoptions(precision=6, suppress=True)
np.random.seed(42)
print('Setup complete. Libraries loaded.')

1. Triangular Systems

Triangular systems are the 'solved case' in linear algebra: forward substitution solves $L\mathbf{x}=\mathbf{b}$ in $O(n^2)$, backward substitution solves $U\mathbf{x}=\mathbf{b}$ in $O(n^2)$. All factorizations reduce to triangular solves.
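As a minimal worked example of forward substitution on a unit lower triangular system:

$$
L = \begin{pmatrix} 1 & 0 \\ 2 & 1 \end{pmatrix}, \quad
\mathbf{b} = \begin{pmatrix} 3 \\ 8 \end{pmatrix}
\;\Rightarrow\;
x_1 = 3, \qquad x_2 = 8 - 2\,x_1 = 2.
$$

Each unknown uses only previously computed entries, one row at a time, which is where the $O(n^2)$ count comes from.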

Code cell 5

# === 1.1 Forward and Backward Substitution ===

def forward_sub(L, b):
    """Solve Lx=b for unit lower triangular L (no division)."""
    n = len(b)
    x = b.copy().astype(float)
    for i in range(1, n):
        x[i] -= L[i, :i] @ x[:i]
    return x

def backward_sub(U, b):
    """Solve Ux=b for upper triangular U."""
    n = len(b)
    x = b.copy().astype(float)
    for i in range(n-1, -1, -1):
        x[i] -= U[i, i+1:] @ x[i+1:]
        x[i] /= U[i, i]
    return x

# Test on a 5x5 system
np.random.seed(42)
L = np.tril(np.random.randn(5, 5))
np.fill_diagonal(L, 1.0)   # unit lower triangular
U = np.triu(np.random.randn(5, 5))
b = np.random.randn(5)

x_fwd = forward_sub(L, b)
x_bwd = backward_sub(U, b)

x_fwd_ref = la.solve_triangular(L, b, lower=True, unit_diagonal=True)
x_bwd_ref = la.solve_triangular(U, b, lower=False)

ok1 = np.allclose(x_fwd, x_fwd_ref)
ok2 = np.allclose(x_bwd, x_bwd_ref)
print(f"{'PASS' if ok1 else 'FAIL'} — Forward substitution matches scipy")
print(f"{'PASS' if ok2 else 'FAIL'} — Backward substitution matches scipy")

# Residuals
res_fwd = np.linalg.norm(L @ x_fwd - b) / np.linalg.norm(b)
res_bwd = np.linalg.norm(U @ x_bwd - b) / np.linalg.norm(b)
print(f"Forward sub residual: {res_fwd:.2e} (expect O(u) = O(1e-16))")
print(f"Backward sub residual: {res_bwd:.2e}")

2. LU Factorization

LU factorization decomposes $A$ into lower and upper triangular factors. Pivoting is essential: without it, a small pivot produces huge multipliers, the intermediate entries grow, and the original data is wiped out by rounding.
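To see why on the $2\times 2$ example used below: with $A = \begin{pmatrix}\varepsilon & 1\\ 1 & 2\end{pmatrix}$ and no pivoting, the multiplier is $1/\varepsilon$ and elimination produces

$$
U = \begin{pmatrix} \varepsilon & 1 \\ 0 & 2 - 1/\varepsilon \end{pmatrix}
\;\approx\;
\begin{pmatrix} \varepsilon & 1 \\ 0 & -1/\varepsilon \end{pmatrix}
\quad\text{in FP64 when } \varepsilon = 10^{-16},
$$

so the entry $2$ is swallowed by rounding and back substitution reconstructs $x_1$ from information that is no longer there. Swapping the rows first keeps every multiplier at most $1$ in magnitude.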

Code cell 7

# === 2.1 Naive LU Without Pivoting — Demonstrating Failure ===

def lu_naive(A):
    """Gaussian elimination WITHOUT pivoting. Returns L, U (may fail)."""
    n = A.shape[0]
    A = A.copy().astype(float)
    L = np.eye(n)
    for k in range(n-1):
        if abs(A[k, k]) < 1e-300:
            raise ValueError(f'Zero pivot at step {k}')
        L[k+1:, k] = A[k+1:, k] / A[k, k]   # multipliers
        A[k+1:, :] -= np.outer(L[k+1:, k], A[k, :])
    return L, np.triu(A)

# Example 1: Works fine for a nice matrix
A_nice = np.array([[3., 1., 2.],
                   [6., 4., 5.],
                   [9., 8., 10.]])
L_nice, U_nice = lu_naive(A_nice)
err_nice = np.linalg.norm(A_nice - L_nice @ U_nice)
print(f'Naive LU on nice matrix: ||A - LU|| = {err_nice:.2e}')

# Example 2: Catastrophic failure with tiny pivot
eps = 1e-16
A_bad = np.array([[eps, 1.], [1., 2.]])
b_bad = np.array([1., 3.])

# True solution (well-conditioned): x ≈ (1, 1)
x_true = np.linalg.solve(A_bad, b_bad)
print(f'True solution: {x_true}')

# Naive LU solution
L_bad, U_bad = lu_naive(A_bad)
y = forward_sub(L_bad, b_bad)
x_naive = backward_sub(U_bad, y)
print(f'Naive LU solution: {x_naive}')
print(f'Relative error: {np.linalg.norm(x_naive - x_true)/np.linalg.norm(x_true):.2e}')
print(f'Growth factor rho_2 = {abs(U_bad[1,1]) / abs(A_bad).max():.2e}')

Code cell 8

# === 2.2 LU With Partial Pivoting — PA = LU ===

def lu_pivot(A):
    """LU with partial pivoting. Returns P (perm array), L, U."""
    n = A.shape[0]
    A = A.copy().astype(float)
    piv = np.arange(n)
    L = np.zeros((n, n))
    np.fill_diagonal(L, 1.0)

    for k in range(n-1):
        # Find pivot
        p = k + np.argmax(np.abs(A[k:, k]))
        # Swap rows
        A[[k, p]] = A[[p, k]]
        L[[k, p], :k] = L[[p, k], :k]
        piv[[k, p]] = piv[[p, k]]
        # Compute multipliers
        if abs(A[k, k]) < 1e-14:
            continue
        L[k+1:, k] = A[k+1:, k] / A[k, k]
        A[k+1:] -= np.outer(L[k+1:, k], A[k])

    U = np.triu(A)
    return piv, L, U

# Test on the 'bad' matrix
piv, L_p, U_p = lu_pivot(A_bad)
P_mat = np.eye(2)[piv]   # permutation matrix

err_PA_LU = np.linalg.norm(P_mat @ A_bad - L_p @ U_p)
print(f'Partial pivot: ||PA - LU|| = {err_PA_LU:.2e}')

# Solve using pivoted LU
b_perm = b_bad[piv]
y = forward_sub(L_p, b_perm)
x_piv = backward_sub(U_p, y)
print(f'Pivoted LU solution: {x_piv}')
print(f'Relative error: {np.linalg.norm(x_piv - x_true)/np.linalg.norm(x_true):.2e}')

# Verify on a larger random system
n = 50
A_rand = np.random.randn(n, n)
b_rand = np.random.randn(n)
piv_r, L_r, U_r = lu_pivot(A_rand)
P_r = np.eye(n)[piv_r]
err_factorization = np.linalg.norm(P_r @ A_rand - L_r @ U_r, 'fro')
print(f'||PA - LU||_F for n=50 random: {err_factorization:.2e}')

# Compare with scipy
P_sci, L_sci, U_sci = la.lu(A_rand)
x_ref = np.linalg.solve(A_rand, b_rand)
b_p = b_rand[piv_r]
y_p = la.solve_triangular(L_r, b_p, lower=True, unit_diagonal=True)
x_p = la.solve_triangular(U_r, y_p, lower=False)
print(f'Solution relative error vs numpy: {np.linalg.norm(x_p - x_ref)/np.linalg.norm(x_ref):.2e}')

Code cell 9

# === 2.3 Growth Factor Analysis ===

def growth_factor(A):
    """Approximate growth factor for LU with partial pivoting.

    The true growth factor uses the largest entry of every intermediate Schur
    complement; max|U| / max|A| is a convenient proxy available directly from
    the finished factorization."""
    _, _, U = la.lu(A)
    return np.abs(U).max() / np.abs(A).max()

# Random matrices of various sizes
sizes = [10, 20, 50, 100, 200]
gf_random = []
for n in sizes:
    rhos = [growth_factor(np.random.randn(n, n)) for _ in range(10)]
    gf_random.append(np.mean(rhos))
    print(f'n={n}: avg growth factor = {np.mean(rhos):.2f}, max = {np.max(rhos):.2f}')

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(9, 5))
    ax.plot(sizes, gf_random, 'o-', color=COLORS['primary'], label='Random matrix (avg)')
    ax.plot(sizes, [n**(2/3) for n in sizes], '--', color=COLORS['secondary'],
            label=r'$n^{2/3}$ (empirical bound)')
    ax.plot(sizes, [2**(n-1) for n in sizes], ':', color=COLORS['error'],
            label=r'$2^{n-1}$ (worst case)')
    ax.set_yscale('log')
    ax.set_xlabel('Matrix size $n$')
    ax.set_ylabel('Growth factor $\\rho_n$')
    ax.set_title('LU growth factor: random vs theoretical bounds')
    ax.legend()
    fig.tight_layout()
    plt.show()
    print('Growth factor plot shown.')

Code cell 10

# === 2.4 Determinant via LU ===

def det_via_lu(A):
    """Compute det(A) = det(P) * prod(diag(U)) from the LU factorization."""
    # scipy.linalg.lu returns P, L, U with A = P @ L @ U (L has unit diagonal)
    P, L, U = la.lu(A)
    det_P = np.linalg.det(P)   # permutation matrix: determinant is +1 or -1
    det_U = np.prod(np.diag(U))
    return det_P * det_U

np.random.seed(42)
for n in [3, 5, 8, 10]:
    A_test = np.random.randn(n, n)
    det_lu = det_via_lu(A_test)
    det_np = np.linalg.det(A_test)
    rel_err = abs(det_lu - det_np) / abs(det_np)
    print(f'n={n}: det(LU)={det_lu:.6f}, det(numpy)={det_np:.6f}, rel_err={rel_err:.2e}')

# Special case: singular matrix
A_sing = np.array([[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]])
print(f'Singular matrix det = {det_via_lu(A_sing):.2e} (expect 0)')

3. QR Factorization: Householder Algorithm

Householder reflectors are orthogonal matrices $H = I - 2\mathbf{v}\mathbf{v}^\top/\|\mathbf{v}\|^2$ that zero out subdiagonal entries. They are unconditionally backward stable: the orthogonality error $\|\hat{Q}^\top\hat{Q} - I\|_F = O(u)$ regardless of condition number.

Code cell 12

# === 3.1 Householder Reflection Construction ===

def householder_vec(a):
    """Compute Householder vector v such that H @ a = alpha * e_1.
    Sign convention: alpha = -sign(a[0]) * norm(a) to avoid cancellation
    (sign(0) is treated as +1)."""
    n = len(a)
    sigma = np.linalg.norm(a)
    if sigma == 0:
        return np.zeros(n), 0.0
    sign0 = 1.0 if a[0] >= 0 else -1.0
    alpha = -sign0 * sigma
    v = a.copy().astype(float)
    v[0] -= alpha         # v = a - alpha * e_1
    beta = 2.0 / (v @ v)  # 2 / ||v||^2
    return v, beta

def householder_apply(v, beta, C):
    """Apply H = I - beta*v*v^T to C implicitly. O(len(v) * C.shape[1])."""
    w = beta * (v @ C)   # row vector of inner products
    return C - np.outer(v, w)

# Test: H @ a should equal alpha * e_1
a = np.array([3., 1., 4., 1., 5.])
v, beta = householder_vec(a)
Ha = a - beta * v * (v @ a)

alpha_expected = -np.sign(a[0]) * np.linalg.norm(a)
expected = np.zeros_like(a)
expected[0] = alpha_expected

ok = np.allclose(Ha, expected, atol=1e-14)
print(f"{'PASS' if ok else 'FAIL'} — H @ a = alpha * e_1")
print(f'Ha = {Ha}')
print(f'Expected: {expected}')
print(f'Entries 1-4 are zero: {np.allclose(Ha[1:], 0)}')

# Verify H is orthogonal (implicitly)
n = 5
H_explicit = np.eye(n) - beta * np.outer(v, v)
ok_orth = np.allclose(H_explicit @ H_explicit.T, np.eye(n))
print(f"{'PASS' if ok_orth else 'FAIL'} — H is orthogonal")

Code cell 13

# === 3.2 Sign Convention — Avoiding Cancellation ===

# Show why we use alpha = -sign(a[0])*norm(a)
a_pos = np.array([10.0, 1.0, 1.0])   # a[0] positive
a_neg = np.array([-10.0, 1.0, 1.0])  # a[0] negative

print('Case: a[0] > 0')
print(f'  Good sign: v[0] = a[0] - (-norm(a)) = {a_pos[0] + np.linalg.norm(a_pos):.4f}')
print(f'  Bad sign:  v[0] = a[0] - (+norm(a)) = {a_pos[0] - np.linalg.norm(a_pos):.6f} <- cancellation!')

print('\nCase: a[0] < 0')
print(f'  Good sign: v[0] = a[0] - (+norm(a)) = {a_neg[0] - np.linalg.norm(a_neg):.4f}')
print(f'  Bad sign:  v[0] = a[0] - (-norm(a)) = {a_neg[0] + np.linalg.norm(a_neg):.6f} <- cancellation!')

# Quantify the cancellation error when the trailing components are small:
# a = [1, eps], so norm(a) is close to a[0] and the bad sign subtracts two
# nearly equal numbers.  Exact references use an algebraically equivalent,
# cancellation-free rewrite.
errors_good = []
errors_bad = []
eps_vals = np.logspace(-16, 0, 100)

for eps in eps_vals:
    a = np.array([1.0, eps])    # a[0] dominates the norm
    norm_a = np.linalg.norm(a)
    v_good = a[0] + norm_a      # good: alpha = -norm, no cancellation
    v_bad = a[0] - norm_a       # bad: alpha = +norm, subtracts nearly equal numbers
    exact_good = 1.0 + np.sqrt(1.0 + eps**2)
    exact_bad = -eps**2 / (1.0 + np.sqrt(1.0 + eps**2))   # stable rewrite of 1 - sqrt(1+eps^2)
    errors_good.append(abs(v_good - exact_good) / abs(exact_good))
    errors_bad.append(abs(v_bad - exact_bad) / max(abs(exact_bad), 1e-300))

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(9, 5))
    ax.semilogx(eps_vals, errors_good, color=COLORS['tertiary'], label='Good sign (no cancellation)')
    ax.semilogx(eps_vals, errors_bad, color=COLORS['error'], label='Bad sign (cancellation)')
    ax.set_xlabel('$\\epsilon$ (size of the trailing component)')
    ax.set_ylabel('Relative error in $v_1$')
    ax.set_title('Sign convention in Householder: avoiding cancellation')
    ax.legend()
    fig.tight_layout()
    plt.show()

Code cell 14

# === 3.3 Householder QR Algorithm ===

def householder_qr(A):
    """Compute thin QR decomposition using Householder reflectors.
    Returns Q (m x n) and R (n x n)."""
    m, n = A.shape
    A = A.copy().astype(float)
    reflectors = []    # store (k, v, beta); Q is accumulated afterwards in reverse

    for k in range(n):
        # Extract column k below diagonal
        a = A[k:, k]
        v, beta = householder_vec(a)
        reflectors.append((k, v, beta))

        # Apply H_k to A[k:, k:]
        A[k:, k:] = householder_apply(v, beta, A[k:, k:])

    # Build Q by applying reflectors in reverse
    Q = np.eye(m, n)
    for k, v, beta in reversed(reflectors):
        Q[k:, k:] = householder_apply(v, beta, Q[k:, k:])

    R = np.triu(A[:n, :])
    return Q, R

# Test on a 6x4 matrix
np.random.seed(42)
m, n = 6, 4
A_test = np.random.randn(m, n)
Q_h, R_h = householder_qr(A_test)

# Verify
err_A = np.linalg.norm(A_test - Q_h @ R_h, 'fro')
err_orth = np.linalg.norm(Q_h.T @ Q_h - np.eye(n), 'fro')
print(f'||A - QR||_F = {err_A:.2e} (expect O(u))')
print(f'||Q^T Q - I||_F = {err_orth:.2e} (expect O(u))')
ok_A = err_A < 1e-12
ok_orth = err_orth < 1e-12
print(f"{'PASS' if ok_A else 'FAIL'} — Factorization accuracy")
print(f"{'PASS' if ok_orth else 'FAIL'} — Orthogonality")

# Compare R to scipy (column signs are arbitrary, so compare magnitudes)
Q_sci, R_sci = la.qr(A_test, mode='economic')
print(f'|| |R_householder| - |R_scipy| ||_F = {np.linalg.norm(np.abs(R_h) - np.abs(R_sci), "fro"):.2e}')

Code cell 15

# === 3.4 Classical Gram-Schmidt vs Householder Orthogonality ===

def gram_schmidt_classical(A):
    """Classical Gram-Schmidt (numerically unstable)."""
    m, n = A.shape
    Q = np.zeros((m, n))
    R = np.zeros((n, n))
    for j in range(n):
        v = A[:, j].astype(float)
        for i in range(j):
            R[i, j] = Q[:, i] @ A[:, j]
            v -= R[i, j] * Q[:, i]
        R[j, j] = np.linalg.norm(v)
        Q[:, j] = v / R[j, j]
    return Q, R

# Build ill-conditioned matrix: use Vandermonde-like construction
# Hilbert matrix is a classic ill-conditioned case
n_test = 12
H_hilb = np.array([[1.0/(i+j+1) for j in range(n_test)] for i in range(n_test)])
kappa = np.linalg.cond(H_hilb)
print(f'Condition number of {n_test}x{n_test} Hilbert matrix: {kappa:.2e}')

Q_cgs, _ = gram_schmidt_classical(H_hilb)
Q_house, _ = householder_qr(H_hilb)

orth_cgs = np.linalg.norm(Q_cgs.T @ Q_cgs - np.eye(n_test), 'fro')
orth_house = np.linalg.norm(Q_house.T @ Q_house - np.eye(n_test), 'fro')

print(f'Classical GS orthogonality error:  {orth_cgs:.2e}')
print(f'Householder orthogonality error:   {orth_house:.2e}')
print(f'Ratio: {orth_cgs / orth_house:.1f}x worse for classical GS')

4. Givens Rotations

A Givens rotation $G(i,j,\theta)$ zeros out a specific entry by rotating in the $(i,j)$ plane. Unlike Householder (which zeros an entire column), Givens touches only two rows, which makes it ideal for sparse matrices and streaming updates.

Code cell 17

# === 4.1 Givens Rotation Construction and Application ===

def givens_rotation(a, b):
    """Compute c, s such that [c s; -s c] [a; b] = [r; 0].
    Returns c, s, r. Numerically stable (LAPACK dlartg convention)."""
    if b == 0:
        c, s, r = 1.0, 0.0, a
    elif abs(b) > abs(a):
        t = -a / b
        s = 1.0 / np.sqrt(1 + t**2)
        c = s * t
        r = b / s
    else:
        t = -b / a
        c = 1.0 / np.sqrt(1 + t**2)
        s = c * t
        r = a / c
    return c, s, r

def apply_givens_left(G_params, A, i, j):
    """Apply Givens G(i,j) on the left to A: A[[i,j], :] updated."""
    c, s = G_params
    A = A.copy()
    row_i = c * A[i, :] - s * A[j, :]
    row_j = s * A[i, :] + c * A[j, :]
    A[i, :] = row_i
    A[j, :] = row_j
    return A

# Test: zero out entry (1,0) in a 3x3 matrix
A_g = np.array([[2., 1., 3.],
                [1., 4., 2.],
                [3., 2., 5.]])

# Zero out A[1,0] using G(0,1)
c, s, r = givens_rotation(A_g[0,0], A_g[1,0])
print(f'Givens: c={c:.4f}, s={s:.4f}, r={r:.4f}')
A_after = apply_givens_left((c, s), A_g, 0, 1)  # applies [[c, -s], [s, c]] to rows 0, 1

# Verify
print(f'A[1,0] after rotation: {A_after[1,0]:.2e} (should be ~0)')
print(f'A[0,0] after rotation: {A_after[0,0]:.4f} (should be r={r:.4f})')

# Check orthogonality preserved (Frobenius norm)
G_mat = np.array([[c, -s, 0], [s, c, 0], [0, 0, 1.]])
ok = np.allclose(G_mat @ A_g, A_after)
print(f"{'PASS' if ok else 'FAIL'} — Givens rotation applied correctly")

Code cell 18

# === 4.2 Givens QR for Dense and Banded Matrices ===

def givens_qr(A):
    """QR via Givens rotations. Returns Q (m x m) and R (m x n)."""
    m, n = A.shape
    A = A.copy().astype(float)
    Q = np.eye(m)

    for j in range(n):
        for i in range(m-1, j, -1):  # eliminate A[i,j]
            c, s, r = givens_rotation(A[i-1, j], A[i, j])
            # Update A rows i-1, i
            row_prev = c * A[i-1, :] - s * A[i, :]
            row_curr = s * A[i-1, :] + c * A[i, :]
            A[i-1, :] = row_prev
            A[i, :] = row_curr
            # Update Q
            col_prev = c * Q[:, i-1] - s * Q[:, i]
            col_curr = s * Q[:, i-1] + c * Q[:, i]
            Q[:, i-1] = col_prev
            Q[:, i] = col_curr

    return Q, A  # A is now upper triangular = R

# Test
np.random.seed(42)
A_test = np.random.randn(5, 3)
Q_g, R_g = givens_qr(A_test)

err_A = np.linalg.norm(A_test - Q_g @ R_g, 'fro')
err_orth = np.linalg.norm(Q_g.T @ Q_g - np.eye(5), 'fro')
print(f'Givens QR: ||A - QR||_F = {err_A:.2e}')
print(f'Givens QR: ||Q^TQ - I||_F = {err_orth:.2e}')
print(f"{'PASS' if err_A < 1e-12 else 'FAIL'} — Givens QR accurate")

# Compare against numpy's Householder-based QR for the same matrix
Q_h_ref, R_h_ref = np.linalg.qr(A_test)
print(f'Max |R_givens| vs |R_numpy|: {np.max(np.abs(np.abs(R_g[:3]) - np.abs(R_h_ref))):.2e}')

5. Rank-Revealing QR (RRQR)

Column-pivoted QR reorders columns to expose rank structure: $AP = QR$ with $|R_{11}| \geq |R_{22}| \geq \cdots$. The diagonal of $R$ reveals the numerical rank: look for where the pivots drop.

Code cell 20

# === 5.1 Column-Pivoted QR for Rank Estimation ===

# scipy.linalg.qr with pivoting=True
np.random.seed(42)

# Build a matrix with numerical rank ~4: A = U @ S @ V^T with four dominant singular values
m_r, n_r = 20, 8
U_r = np.linalg.qr(np.random.randn(m_r, m_r))[0][:, :m_r]
V_r = np.linalg.qr(np.random.randn(n_r, n_r))[0]
# True singular values: 100, 10, 1, 0.1, 0.001, 0.001, 0.001, 0.001
sv = np.array([100., 10., 1., 0.1, 0.001, 0.001, 0.001, 0.001])
S_r = np.zeros((m_r, n_r))
np.fill_diagonal(S_r, sv[:n_r])
A_rank4 = U_r @ S_r @ V_r.T

# Column-pivoted QR
import scipy.linalg as la
Q_rrqr, R_rrqr, P_rrqr = la.qr(A_rank4, pivoting=True)
R_diag = np.abs(np.diag(R_rrqr))

print('Diagonal of |R| from RRQR:')
for i, rd in enumerate(R_diag):
    ratio = rd / R_diag[0]
    print(f'  |R[{i},{i}]| = {rd:.4f}, ratio = {ratio:.2e}')

# Rank estimation: where does the ratio drop?
tau = 1e-3
rank_est = np.sum(R_diag / R_diag[0] > tau)
true_sv = np.linalg.svd(A_rank4, compute_uv=False)
rank_svd = np.sum(true_sv / true_sv[0] > tau)
print(f'\nEstimated rank (RRQR, tau={tau}): {rank_est}')
print(f'True rank    (SVD,  tau={tau}): {rank_svd}')
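
# Added illustration: the same pivoted factorization gives a cheap low-rank
# approximation, not just a rank estimate.  Truncating to the first k columns of Q
# approximates the permuted matrix: A[:, P] ~= Q[:, :k] @ R[:k, :].
k_trunc = 4   # illustrative truncation rank
A_perm_approx = Q_rrqr[:, :k_trunc] @ R_rrqr[:k_trunc, :]
A_approx = np.empty_like(A_rank4)
A_approx[:, P_rrqr] = A_perm_approx               # undo the column permutation
err_rrqr = np.linalg.norm(A_rank4 - A_approx, 'fro')
err_opt = np.sqrt(np.sum(true_sv[k_trunc:]**2))   # Eckart-Young optimum (SVD from above)
print(f'\nRank-{k_trunc} RRQR approximation error: {err_rrqr:.4e}')
print(f'Rank-{k_trunc} optimal (SVD) error:       {err_opt:.4e}')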

Code cell 21

# === 5.2 Visualizing RRQR Rank Revelation ===

import numpy as np, scipy.linalg as la
HAS_MPL = True
try:
    import matplotlib.pyplot as plt, matplotlib as mpl
except:
    HAS_MPL = False

COLORS = {'primary': '#0077BB', 'secondary': '#EE7733', 'tertiary': '#009988',
          'error': '#CC3311', 'neutral': '#555555', 'highlight': '#EE3377'}

if HAS_MPL:
    true_sv = np.linalg.svd(A_rank4, compute_uv=False)
    R_diag2 = np.abs(np.diag(R_rrqr))

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    ax = axes[0]
    ax.semilogy(range(1, len(true_sv)+1), true_sv, 'o-',
                color=COLORS['primary'], label='True singular values')
    ax.semilogy(range(1, len(R_diag2)+1), R_diag2, 's--',
                color=COLORS['secondary'], label='|diag(R)| from RRQR')
    ax.axhline(tau * true_sv[0], color=COLORS['neutral'], linestyle=':',
               label=f'Threshold (tau={tau})')
    ax.set_xlabel('Index $k$')
    ax.set_ylabel('Value')
    ax.set_title('Rank revelation: RRQR vs SVD')
    ax.legend(fontsize=10)

    ax2 = axes[1]
    # Show original matrix and reordered matrix
    im = ax2.imshow(np.abs(R_rrqr), cmap='viridis', aspect='auto')
    plt.colorbar(im, ax=ax2, label='|R|')
    ax2.set_title('|R| from column-pivoted QR')
    ax2.set_xlabel('Column')
    ax2.set_ylabel('Row')

    fig.tight_layout()
    plt.show()
    print('RRQR visualization shown.')

6. Cholesky Factorization (Computational)

Brief computational recap; full theory in §07. Key points: twice as fast as LU, works only for SPD matrices, unconditionally backward stable, and $\log\det A = 2\sum_i \log L_{ii}$.
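The log-determinant identity is immediate from the factorization:

$$
\det A = \det(L L^\top) = \det(L)\,\det(L^\top) = \Big(\prod_i L_{ii}\Big)^{2}
\;\Rightarrow\;
\log\det A = 2\sum_i \log L_{ii},
$$

which is why Cholesky yields the log-determinant essentially for free and without overflow: the individual $L_{ii}$ stay in range even when $\det A$ itself would not.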

Code cell 23

# === 6.1 Cholesky Algorithm from Scratch ===

def cholesky_scratch(A):
    """Compute Cholesky L such that A = L L^T. Raises if not SPD."""
    n = A.shape[0]
    L = np.zeros_like(A, dtype=float)
    for j in range(n):
        # Diagonal entry
        diag_sq = A[j, j] - L[j, :j] @ L[j, :j]
        if diag_sq <= 0:
            raise ValueError(f'Matrix not positive definite at step {j}, diag_sq={diag_sq}')
        L[j, j] = np.sqrt(diag_sq)
        # Subdiagonal entries
        if j + 1 < n:
            L[j+1:, j] = (A[j+1:, j] - L[j+1:, :j] @ L[j, :j]) / L[j, j]
    return L

# Test on an SPD matrix
np.random.seed(42)
n = 6
X = np.random.randn(n, n+3)
A_spd = X @ X.T / (n+3) + np.eye(n) * 0.1  # guaranteed SPD

L_scratch = cholesky_scratch(A_spd)
L_scipy = np.linalg.cholesky(A_spd)

err_factor = np.linalg.norm(A_spd - L_scratch @ L_scratch.T, 'fro')
err_vs_scipy = np.linalg.norm(L_scratch - L_scipy, 'fro')
print(f'||A - LL^T||_F = {err_factor:.2e}')
print(f'||L_scratch - L_scipy||_F = {err_vs_scipy:.2e}')
print(f"{'PASS' if err_factor < 1e-12 else 'FAIL'} — Cholesky factorization")

# Log-determinant via Cholesky
log_det_chol = 2 * np.sum(np.log(np.diag(L_scratch)))
log_det_np = np.log(np.linalg.det(A_spd))
print(f'log det via Cholesky: {log_det_chol:.6f}')
print(f'log det via numpy:    {log_det_np:.6f}')
print(f"{'PASS' if abs(log_det_chol - log_det_np) < 1e-10 else 'FAIL'} — Log-det accuracy")

Code cell 24

# === 6.2 Cholesky vs LU: Speed and Operation Count ===

import time

sizes_perf = [50, 100, 200, 500]
times_lu = []
times_chol = []

for n in sizes_perf:
    # Build SPD matrix
    X = np.random.randn(n, n+5)
    A_spd = X @ X.T / (n+5) + np.eye(n)

    # Time LU
    t0 = time.perf_counter()
    for _ in range(10):
        la.lu_factor(A_spd)
    t_lu = (time.perf_counter() - t0) / 10

    # Time Cholesky
    t0 = time.perf_counter()
    for _ in range(10):
        la.cholesky(A_spd)
    t_chol = (time.perf_counter() - t0) / 10

    times_lu.append(t_lu * 1e3)
    times_chol.append(t_chol * 1e3)
    speedup = t_lu / t_chol
    print(f'n={n:4d}: LU={t_lu*1e3:.2f}ms, Chol={t_chol*1e3:.2f}ms, speedup={speedup:.1f}x')

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(9, 5))
    ax.plot(sizes_perf, times_lu, 'o-', color=COLORS['primary'], label='LU (DGETRF)')
    ax.plot(sizes_perf, times_chol, 's--', color=COLORS['secondary'], label='Cholesky (DPOTRF)')
    ax.set_xlabel('Matrix size $n$')
    ax.set_ylabel('Time (ms)')
    ax.set_title('LU vs Cholesky factorization time (SPD matrices)')
    ax.legend()
    fig.tight_layout()
    plt.show()

7. Numerical Stability and Backward Error

The backward error framework says: forward error $\lesssim$ condition number $\times$ backward error. Backward-stable algorithms keep the backward error at $O(u)$; the condition number $\kappa(A)$ then determines how much of that error shows up in the solution.
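A minimal numerical sketch of that split, using nothing beyond NumPy: compute the normwise backward error $\eta = \|b - A\hat{x}\| / (\|A\|\,\|\hat{x}\| + \|b\|)$ of an LU-based solve and check that the forward error sits below the approximate bound $\kappa(A)\,\eta$.

import numpy as np

np.random.seed(0)
n = 40
A = np.random.randn(n, n)
x_true = np.ones(n)
b = A @ x_true

x_hat = np.linalg.solve(A, b)                      # LU-based solve
residual = b - A @ x_hat
eta = np.linalg.norm(residual) / (np.linalg.norm(A, 2) * np.linalg.norm(x_hat)
                                  + np.linalg.norm(b))
fwd_err = np.linalg.norm(x_hat - x_true) / np.linalg.norm(x_true)
kappa = np.linalg.cond(A, 2)

print(f'backward error eta     = {eta:.2e}   (O(u) for a backward-stable solver)')
print(f'forward error          = {fwd_err:.2e}')
print(f'kappa(A) * eta (bound) = {kappa * eta:.2e}')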

Code cell 26

# === 7.1 Stability Comparison: LU vs QR for Ill-Conditioned Systems ===

import numpy as np, scipy.linalg as la

def build_ill_conditioned(n, kappa_target):
    """Build a matrix with condition number approximately kappa_target."""
    U, _ = np.linalg.qr(np.random.randn(n, n))
    V, _ = np.linalg.qr(np.random.randn(n, n))
    sv = np.logspace(0, -np.log10(kappa_target), n)
    return U @ np.diag(sv) @ V.T

np.random.seed(42)
results = []

for kappa in [1e2, 1e6, 1e10, 1e13]:
    A = build_ill_conditioned(20, kappa)
    x_true = np.ones(20)  # known true solution
    b = A @ x_true

    # LU solve
    x_lu = np.linalg.solve(A, b)   # uses LU internally
    err_lu = np.linalg.norm(x_lu - x_true) / np.linalg.norm(x_true)

    # QR solve (via lstsq — Householder QR internally)
    x_qr, _, _, _ = np.linalg.lstsq(A, b, rcond=None)
    err_qr = np.linalg.norm(x_qr - x_true) / np.linalg.norm(x_true)

    kappa_actual = np.linalg.cond(A)
    print(f'kappa={kappa:.0e}: err_LU={err_lu:.2e}, err_QR={err_qr:.2e}, kappa_actual={kappa_actual:.2e}')
    results.append((kappa_actual, err_lu, err_qr))

print('\nNote: Both LU and QR have similar forward errors for square systems.')
print('QR advantage is for LEAST SQUARES (avoids kappa^2 from normal equations).')
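
# The kappa^2 penalty is easy to verify directly: kappa(A^T A) = kappa(A)^2.
# (Added check, reusing build_ill_conditioned from above.)
A_chk = build_ill_conditioned(20, 1e5)
print(f'kappa(A)     = {np.linalg.cond(A_chk):.2e}')
print(f'kappa(A^T A) = {np.linalg.cond(A_chk.T @ A_chk):.2e}   (~ kappa(A)^2)')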

Code cell 27

# === 7.2 Normal Equations vs QR for Least Squares ===

np.random.seed(42)
errors_ne = []
errors_qr_ls = []
kappas = []

for kappa in np.logspace(1, 14, 30):
    m, n = 40, 20
    U, _ = np.linalg.qr(np.random.randn(m, n))
    V, _ = np.linalg.qr(np.random.randn(n, n))
    sv = np.logspace(0, -np.log10(kappa), n)
    A = U @ np.diag(sv) @ V.T
    x_true = np.random.randn(n)
    b = A @ x_true + 1e-10 * np.random.randn(m)  # tiny noise

    # Normal equations: (A^T A) x = A^T b
    try:
        ATA = A.T @ A
        ATb = A.T @ b
        L_ne = la.cholesky(ATA, lower=True)
        y = la.solve_triangular(L_ne, ATb, lower=True)
        x_ne = la.solve_triangular(L_ne.T, y)
        err_ne = np.linalg.norm(x_ne - x_true) / (np.linalg.norm(x_true) + 1e-14)
    except Exception:
        err_ne = 1.0

    # QR via lstsq
    x_qr_sol, _, _, _ = np.linalg.lstsq(A, b, rcond=None)
    err_qr_sol = np.linalg.norm(x_qr_sol - x_true) / (np.linalg.norm(x_true) + 1e-14)

    kappa_act = np.linalg.cond(A)
    kappas.append(kappa_act)
    errors_ne.append(err_ne)
    errors_qr_ls.append(err_qr_sol)

if HAS_MPL:
    fig, ax = plt.subplots(figsize=(9, 5))
    ax.loglog(kappas, errors_ne, '.', color=COLORS['error'], alpha=0.7,
              label='Normal equations (Cholesky of $A^TA$)')
    ax.loglog(kappas, errors_qr_ls, 's', color=COLORS['tertiary'], alpha=0.7, markersize=4,
              label='QR (Householder lstsq)')
    ax.loglog(kappas, [k * 2e-16 for k in kappas], '--', color=COLORS['neutral'],
              label=r'$\kappa \cdot u$ reference')
    ax.loglog(kappas, [k**2 * 2e-16 for k in kappas], ':', color=COLORS['neutral'],
              label=r'$\kappa^2 \cdot u$ reference')
    ax.set_xlabel('Condition number $\\kappa(A)$')
    ax.set_ylabel('Relative solution error')
    ax.set_title('Normal equations vs QR for least squares')
    ax.legend(fontsize=10)
    ax.set_ylim(1e-18, 10)
    fig.tight_layout()
    plt.show()
    print('Plot: QR tracks kappa*u; normal equations track kappa^2*u.')

Code cell 28

# === 7.3 Iterative Refinement in Mixed Precision ===

np.random.seed(42)
n = 30
# Build moderately ill-conditioned system
A_ir = build_ill_conditioned(n, 1e8)
x_true_ir = np.ones(n)
b_ir = A_ir @ x_true_ir

# Simulate a low-precision factorization: round A to FP32, then factor in FP64,
# so the factorization error is on the order of the FP32 rounding of A
A32 = A_ir.astype(np.float32)
lu32, piv32 = la.lu_factor(A32.astype(np.float64))

# Initial solve
x_curr = la.lu_solve((lu32, piv32), b_ir)
print('Iterative Refinement:')
for k in range(5):
    r = b_ir - A_ir @ x_curr  # FP64 residual
    r_norm = np.linalg.norm(r) / np.linalg.norm(b_ir)
    err_x = np.linalg.norm(x_curr - x_true_ir) / np.linalg.norm(x_true_ir)
    print(f'  Step {k}: ||r||/||b|| = {r_norm:.2e}, ||x-x*||/||x*|| = {err_x:.2e}')
    if r_norm < 1e-14:
        break
    delta = la.lu_solve((lu32, piv32), r)
    x_curr = x_curr + delta

print(f'Final error: {np.linalg.norm(x_curr - x_true_ir)/np.linalg.norm(x_true_ir):.2e}')

8. Blocked Algorithms and LAPACK

Blocked algorithms reformulate factorizations to use BLAS-3 (matrix-matrix) operations instead of BLAS-2 (matrix-vector), achieving $O(n)$ arithmetic intensity and a 5-10x speedup on modern CPUs/GPUs.

Code cell 30

# === 8.1 BLAS-2 vs BLAS-3 Arithmetic Intensity ===

import time
import numpy as np

# BLAS-2: matrix-vector multiply (reads n^2 words, does 2n^2 flops -> intensity O(1))
# BLAS-3: matrix-matrix multiply (reads 2n^2 words, does 2n^3 flops -> intensity O(n))

n = 500
A = np.random.randn(n, n)
x = np.random.randn(n)
B = np.random.randn(n, n)

nrep = 20

# BLAS-2: DGEMV
t0 = time.perf_counter()
for _ in range(nrep):
    y = A @ x
t_gemv = (time.perf_counter() - t0) / nrep

# BLAS-3: DGEMM
t0 = time.perf_counter()
for _ in range(nrep):
    C = A @ B
t_gemm = (time.perf_counter() - t0) / nrep

flops_gemv = 2 * n**2
flops_gemm = 2 * n**3

gflops_gemv = flops_gemv / t_gemv / 1e9
gflops_gemm = flops_gemm / t_gemm / 1e9

print(f'n = {n}')
print(f'DGEMV (BLAS-2): {t_gemv*1e3:.2f} ms, {gflops_gemv:.1f} GFLOP/s')
print(f'DGEMM (BLAS-3): {t_gemm*1e3:.2f} ms, {gflops_gemm:.1f} GFLOP/s')
print(f'BLAS-3 / BLAS-2 throughput ratio: {gflops_gemm / gflops_gemv:.1f}x')
print('\nThis ratio grows with n — blocked algorithms exploit BLAS-3.')
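
# To see how a factorization is reorganized around BLAS-3, here is a minimal
# added sketch of a blocked right-looking Cholesky (illustrative, not the LAPACK
# code): the per-panel work is small, while the trailing update
# A22 -= L21 @ L21^T is a large matrix-matrix product.  `block` is an arbitrary
# tile size chosen for illustration.
import scipy.linalg as la

def blocked_cholesky(A, block=64):
    """Blocked Cholesky sketch for SPD A; returns lower-triangular L with A = L L^T."""
    n = A.shape[0]
    A = A.copy().astype(float)
    for k in range(0, n, block):
        end = min(k + block, n)
        # Unblocked factorization of the diagonal block (small)
        A[k:end, k:end] = la.cholesky(A[k:end, k:end], lower=True)
        if end < n:
            # Panel solve: L21 = A21 @ L11^{-T}  (triangular solve, many right-hand sides)
            A[end:, k:end] = la.solve_triangular(A[k:end, k:end],
                                                 A[end:, k:end].T, lower=True).T
            # Trailing update: A22 -= L21 @ L21^T  (the BLAS-3 bulk of the work)
            A[end:, end:] -= A[end:, k:end] @ A[end:, k:end].T
    return np.tril(A)

X_blk = np.random.randn(300, 320)
A_blk = X_blk @ X_blk.T / 320 + np.eye(300)
L_blk = blocked_cholesky(A_blk, block=64)
print(f'Blocked Cholesky check: ||A - LL^T||_F = {np.linalg.norm(A_blk - L_blk @ L_blk.T):.2e}')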

Code cell 31

# === 8.2 LAPACK Routine Reference and Timing ===

import scipy.linalg as la
import time

np.random.seed(42)
n = 300
X = np.random.randn(n, n + 50)
A_spd = X @ X.T / (n + 50) + np.eye(n)
A_gen = np.random.randn(n, n)
b_vec = np.random.randn(n)

timings = {}

# DGETRF: LU with partial pivoting
t0 = time.perf_counter()
for _ in range(5):
    lu, piv = la.lu_factor(A_gen)
timings['DGETRF (LU)'] = (time.perf_counter() - t0) / 5

# DGEQRF: Householder QR
t0 = time.perf_counter()
for _ in range(5):
    Q_t, R_t = la.qr(A_gen)
timings['DGEQRF (QR)'] = (time.perf_counter() - t0) / 5

# DPOTRF: Cholesky
t0 = time.perf_counter()
for _ in range(5):
    L_t = la.cholesky(A_spd)
timings['DPOTRF (Chol)'] = (time.perf_counter() - t0) / 5

# DGETRS: triangular solve after LU
t0 = time.perf_counter()
for _ in range(20):
    x_s = la.lu_solve((lu, piv), b_vec)
timings['DGETRS (solve)'] = (time.perf_counter() - t0) / 20

print(f'Timings for n={n}:')
for name, t in timings.items():
    print(f'  {name}: {t*1e3:.2f} ms')

# Theoretical flop counts
print(f'\nTheoretical flop counts:')
print(f'  LU (2/3 n^3): {2/3 * n**3 / 1e6:.1f} Mflops')
print(f'  QR (4/3 n^3): {4/3 * n**3 / 1e6:.1f} Mflops')
print(f'  Chol (1/3 n^3): {1/3 * n**3 / 1e6:.1f} Mflops')

9. Gaussian Process Inference via Cholesky

GP regression requires solving $(K + \sigma_n^2 I)\boldsymbol{\alpha} = \mathbf{y}$ and computing $\log\det(K + \sigma_n^2 I)$, both via Cholesky. This is the bottleneck in Bayesian optimization and uncertainty quantification.

Code cell 33

# === 9.1 GP Regression Implementation ===

import numpy as np, scipy.linalg as la

def rbf_kernel(X1, X2, ell=1.0, sf=1.0):
    """RBF/squared-exponential kernel: k(x,x') = sf^2 * exp(-||x-x'||^2 / (2*ell^2))."""
    X1 = np.atleast_2d(X1)
    X2 = np.atleast_2d(X2)
    diff = X1[:, None, :] - X2[None, :, :]  # (n1, n2, d)
    sq_dist = np.sum(diff**2, axis=-1)        # (n1, n2)
    return sf**2 * np.exp(-sq_dist / (2 * ell**2))

def gp_predict(X_train, y_train, X_test, ell=1.0, sf=1.0, sigma_n=0.1):
    """GP posterior prediction using Cholesky for all linear solves."""
    n = len(X_train)
    K_nn = rbf_kernel(X_train, X_train, ell, sf)
    K_ss = rbf_kernel(X_test, X_test, ell, sf)
    K_sn = rbf_kernel(X_test, X_train, ell, sf)

    # Noisy kernel: K + sigma_n^2 * I
    K_noisy = K_nn + sigma_n**2 * np.eye(n)

    # Cholesky factorization
    L = la.cholesky(K_noisy, lower=True)

    # Solve for alpha = (K + sigma^2 I)^{-1} y via forward+backward sub
    v = la.solve_triangular(L, y_train, lower=True)   # L v = y
    alpha = la.solve_triangular(L.T, v, lower=False)  # L^T alpha = v

    # Posterior mean: mu_* = K_*n @ alpha
    mu_star = K_sn @ alpha

    # Posterior variance: Sigma_* = K_** - K_*n (K + sigma^2 I)^{-1} K_n*
    V = la.solve_triangular(L, K_sn.T, lower=True)  # L V = K_n*
    var_star = np.diag(K_ss) - np.sum(V**2, axis=0)

    # Log marginal likelihood
    log_lik = -0.5 * (y_train @ alpha) - np.sum(np.log(np.diag(L))) - 0.5 * n * np.log(2*np.pi)

    return mu_star, np.maximum(var_star, 0), log_lik

# Generate training data from f(x) = sin(3x)
np.random.seed(42)
n_train = 40
X_train = np.linspace(0, 2*np.pi, n_train).reshape(-1, 1)
f_true = np.sin(3 * X_train.ravel())
y_train = f_true + 0.1 * np.random.randn(n_train)

X_test = np.linspace(-0.2, 2*np.pi + 0.2, 200).reshape(-1, 1)

mu_star, var_star, log_lik = gp_predict(X_train, y_train, X_test, ell=0.6, sf=1.0, sigma_n=0.1)
print(f'GP prediction computed. Log marginal likelihood: {log_lik:.3f}')
print(f'Posterior mean range: [{mu_star.min():.2f}, {mu_star.max():.2f}]')
print(f'Posterior std range: [{np.sqrt(var_star).min():.4f}, {np.sqrt(var_star).max():.4f}]')

Code cell 34

# === 9.2 GP Posterior Visualization ===

if HAS_MPL:
    COLORS = {'primary': '#0077BB', 'secondary': '#EE7733', 'tertiary': '#009988',
              'error': '#CC3311', 'neutral': '#555555', 'highlight': '#EE3377'}
    import matplotlib.pyplot as plt
    std_star = np.sqrt(var_star)
    x_plot = X_test.ravel()

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.fill_between(x_plot, mu_star - 2*std_star, mu_star + 2*std_star,
                    alpha=0.2, color=COLORS['primary'], label='95% credible interval')
    ax.plot(x_plot, mu_star, color=COLORS['primary'], label='Posterior mean')
    ax.plot(x_plot, np.sin(3*x_plot), '--', color=COLORS['neutral'],
            label='True function $\\sin(3x)$')
    ax.scatter(X_train.ravel(), y_train, color=COLORS['secondary'],
               s=30, zorder=5, label='Training data')
    ax.set_xlabel('$x$')
    ax.set_ylabel('$f(x)$')
    ax.set_title('GP regression via Cholesky factorization')
    ax.legend(fontsize=10)
    fig.tight_layout()
    plt.show()
    print('GP posterior plot shown.')

Code cell 35

# === 9.3 GP Hyperparameter Optimization via Log Marginal Likelihood ===

# Grid search over (ell, sigma_n)
ell_vals = np.logspace(-1, 1, 20)
sig_vals = np.logspace(-2, 0, 20)

log_liks = np.zeros((len(ell_vals), len(sig_vals)))

for i, ell in enumerate(ell_vals):
    for j, sig in enumerate(sig_vals):
        try:
            _, _, ll = gp_predict(X_train, y_train, X_train, ell=ell, sf=1.0, sigma_n=sig)
            log_liks[i, j] = ll
        except Exception:
            log_liks[i, j] = -np.inf

best_idx = np.unravel_index(np.argmax(log_liks), log_liks.shape)
best_ell = ell_vals[best_idx[0]]
best_sig = sig_vals[best_idx[1]]
print(f'Best hyperparameters: ell={best_ell:.3f}, sigma_n={best_sig:.3f}')
print(f'Best log marginal likelihood: {log_liks[best_idx]:.3f}')

if HAS_MPL:
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(figsize=(8, 6))
    im = ax.contourf(np.log10(sig_vals), np.log10(ell_vals), log_liks,
                     levels=30, cmap='viridis')
    plt.colorbar(im, ax=ax, label='Log marginal likelihood')
    ax.scatter(np.log10(best_sig), np.log10(best_ell), color='red',
               marker='*', s=200, zorder=5, label='Best')
    ax.set_xlabel('$\\log_{10}(\\sigma_n)$')
    ax.set_ylabel('$\\log_{10}(\\ell)$')
    ax.set_title('GP log marginal likelihood as function of hyperparameters')
    ax.legend()
    fig.tight_layout()
    plt.show()

10. Differentiating Through Factorizations

Modern ML requires gradients through linear solves. Implicit differentiation through $A\mathbf{x} = \mathbf{b}$ avoids unrolling $O(n^3)$ operations and reuses the already-computed factorization.
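The rule used below follows from differentiating the constraint: $dA\,\mathbf{x} + A\,d\mathbf{x} = d\mathbf{b}$, so $d\mathbf{x} = A^{-1}(d\mathbf{b} - dA\,\mathbf{x})$. For a scalar loss $f(\mathbf{x})$, with the adjoint $\boldsymbol{\lambda} = A^{-\top}\,\partial f/\partial\mathbf{x}$,

$$
\frac{\partial f}{\partial \mathbf{b}} = \boldsymbol{\lambda},
\qquad
\frac{\partial f}{\partial A} = -\,\boldsymbol{\lambda}\,\mathbf{x}^{\top},
$$

and both pieces reuse the factorization from the forward solve; no $O(n^3)$ work is repeated.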

Code cell 37

# === 10.1 Gradient of log det A via Cholesky ===

import numpy as np, scipy.linalg as la

def log_det_spd(A):
    """log det A for SPD A, computed stably via Cholesky."""
    L = la.cholesky(A, lower=True)
    return 2 * np.sum(np.log(np.diag(L)))

def grad_log_det_cholesky(A):
    """Gradient of log det A = A^{-1}, computed via Cholesky."""
    L = la.cholesky(A, lower=True)
    L_inv = la.solve_triangular(L, np.eye(A.shape[0]), lower=True)
    return L_inv.T @ L_inv  # = A^{-1}

np.random.seed(42)
n = 8
X = np.random.randn(n, n + 5)
A_test = X @ X.T / (n + 5) + np.eye(n) * 0.5

# Analytical gradient
grad_ana = grad_log_det_cholesky(A_test)

# Finite difference verification
eps = 1e-5
grad_fd = np.zeros((n, n))
for i in range(n):
    for j in range(n):
        A_p = A_test.copy(); A_p[i,j] += eps
        A_m = A_test.copy(); A_m[i,j] -= eps
        grad_fd[i, j] = (log_det_spd(A_p) - log_det_spd(A_m)) / (2 * eps)

# For symmetric A, the gradient should be symmetrized
grad_fd_sym = (grad_fd + grad_fd.T) / 2

err_grad = np.linalg.norm(grad_ana - grad_fd_sym, 'fro') / np.linalg.norm(grad_ana, 'fro')
print(f'||grad_chol - grad_FD||_F / ||grad_chol||_F = {err_grad:.2e}')
print(f"{'PASS' if err_grad < 1e-4 else 'FAIL'} — Gradient of log det via Cholesky")

# Verify: grad = A^{-1}
A_inv = np.linalg.inv(A_test)
err_inv = np.linalg.norm(grad_ana - A_inv, 'fro') / np.linalg.norm(A_inv, 'fro')
print(f'||grad - A^{{-1}}||_F / ||A^{{-1}}||_F = {err_inv:.2e}')
print(f"{'PASS' if err_inv < 1e-10 else 'FAIL'} — grad log det A = A^{{-1}}")

Code cell 38

# === 10.2 Implicit Differentiation Through Solve ===

# If x = A^{-1} b and loss = f(x), then df/dA = -A^{-T} (df/dx) x^T
# Equivalently: adjoint lambda = A^{-T} (df/dx), then df/dA = -lambda @ x^T

np.random.seed(42)
n = 10
A = np.random.randn(n, n) + n * np.eye(n)  # diag-dominant for stability
b = np.random.randn(n)

# Forward: solve A x = b
x_sol = np.linalg.solve(A, b)

# Loss: f(x) = ||x||^2
f = np.sum(x_sol**2)
df_dx = 2 * x_sol

# Adjoint: lambda = A^{-T} df_dx
lam = np.linalg.solve(A.T, df_dx)

# Gradient: df/dA = -lambda @ x^T (for square A)
grad_A_ana = -np.outer(lam, x_sol)

# Finite difference verification
eps = 1e-5
grad_A_fd = np.zeros((n, n))
for i in range(n):
    for j in range(n):
        A_p = A.copy(); A_p[i,j] += eps
        A_m = A.copy(); A_m[i,j] -= eps
        x_p = np.linalg.solve(A_p, b)
        x_m = np.linalg.solve(A_m, b)
        grad_A_fd[i,j] = (np.sum(x_p**2) - np.sum(x_m**2)) / (2 * eps)

err = np.linalg.norm(grad_A_ana - grad_A_fd) / np.linalg.norm(grad_A_fd)
print(f'Implicit diff gradient error: {err:.2e}')
print(f"{'PASS' if err < 1e-3 else 'FAIL'} — Implicit differentiation through solve")
print('Cost: one extra adjoint solve with A^T (two triangular solves on a cached factorization), O(n^2)')

11. Randomized Factorizations

For matrices where only a low-rank approximation is needed, randomized algorithms achieve $O(mnr)$ cost versus $O(mn^2)$ for exact QR. These ideas underlie LoRA, GaLore, and sketch-and-solve methods.

Code cell 40

# === 11.1 Randomized QR/SVD (Halko-Martinsson-Tropp) ===

import numpy as np, scipy.linalg as la

def randomized_svd(A, r, n_oversampling=10, n_power_iter=2):
    """Compute rank-r SVD approximation via randomized sketch.
    Halko, Martinsson, Tropp (2011) Algorithm 4.4 with power iteration."""
    m, n = A.shape
    k = r + n_oversampling

    # Step 1: Random sketch
    Omega = np.random.randn(n, k)
    Y = A @ Omega

    # Step 2: Power iteration for better approximation
    for _ in range(n_power_iter):
        Y = A @ (A.T @ Y)

    # Step 3: Orthonormalize sketch
    Q, _ = np.linalg.qr(Y)
    Q = Q[:, :k]

    # Step 4: Project and factor
    B = Q.T @ A           # (k x n) — small matrix
    Uhat, S, Vt = np.linalg.svd(B, full_matrices=False)

    # Step 5: Lift back
    U = Q @ Uhat

    return U[:, :r], S[:r], Vt[:r, :]

# Build a low-rank + noise matrix
np.random.seed(42)
m, n = 500, 300
r_true = 10
U_true = np.linalg.qr(np.random.randn(m, r_true))[0]
V_true = np.linalg.qr(np.random.randn(n, r_true))[0]
sv_true = np.exp(-np.arange(r_true) * 0.5)
A_lr = U_true @ np.diag(sv_true) @ V_true.T + 1e-3 * np.random.randn(m, n)

# Exact SVD
sv_exact = np.linalg.svd(A_lr, compute_uv=False)

# Randomized SVD
r_target = 10
U_r, S_r, Vt_r = randomized_svd(A_lr, r_target)

# Approximation error
A_approx = U_r @ np.diag(S_r) @ Vt_r
err_rand = np.linalg.norm(A_lr - A_approx, 'fro') / np.linalg.norm(A_lr, 'fro')
# Optimal error (Eckart-Young)
err_opt = np.sqrt(np.sum(sv_exact[r_target:]**2)) / np.linalg.norm(A_lr, 'fro')

print(f'Rank-{r_target} approximation:')
print(f'  Randomized SVD error: {err_rand:.4f}')
print(f'  Optimal error (Eckart-Young): {err_opt:.4f}')
print(f'  Ratio: {err_rand/err_opt:.2f}x optimal')
print(f"{'PASS' if err_rand < 2 * err_opt else 'FAIL'} — Randomized SVD near-optimal")

Code cell 41

# === 11.2 LoRA-Inspired Initialization via Randomized QR ===

# LoRA: Delta W = B A, where B in R^{d x r}, A in R^{r x k}
# Key question: how to initialize B and A for good coverage of the weight matrix?

np.random.seed(42)

# Simulate a pretrained weight matrix
d, k = 256, 128
W = np.random.randn(d, k) / np.sqrt(k)  # Xavier-like initialization

# Option 1: Standard LoRA init (B=0, A random)
r = 8
B_zero = np.zeros((d, r))
A_rand = np.random.randn(r, k) / np.sqrt(r)
Delta_W_lora = B_zero @ A_rand

# Option 2: SVD-based initialization (covers principal directions of W)
U_svd, S_svd, Vt_svd = randomized_svd(W, r)
B_svd = U_svd @ np.diag(np.sqrt(S_svd))
A_svd = np.diag(np.sqrt(S_svd)) @ Vt_svd
Delta_W_svd = B_svd @ A_svd

# How well does each capture the weight matrix?
err_lora_init = np.linalg.norm(W - Delta_W_lora, 'fro') / np.linalg.norm(W, 'fro')
err_svd_init = np.linalg.norm(W - Delta_W_svd, 'fro') / np.linalg.norm(W, 'fro')

print(f'LoRA init (B=0, A random): relative error = {err_lora_init:.4f}')
print(f'SVD init: relative error = {err_svd_init:.4f}')
print(f'Eckart-Young bound for rank-{r}: {np.sqrt(sum(np.linalg.svd(W, compute_uv=False)[r:]**2)) / np.linalg.norm(W):.4f}')
print(f'\nSVD initialization captures the low-rank structure of W at rank {r}.')
print('Standard LoRA starts from Delta W = 0 to preserve pretrained behavior.')

12. Machine Learning Applications

Factorizations appear as load-bearing mathematics in:

  • Least squares (QR for linear probing, ridge regression)
  • Second-order optimization (LU/Cholesky for Newton, K-FAC)
  • GP inference (Cholesky — already shown)
  • Gradient compression (randomized SVD/QR — GaLore, LoRA)

Code cell 43

# === 12.1 K-FAC: Kronecker-Factored Cholesky for Natural Gradient ===

import numpy as np, scipy.linalg as la

# K-FAC approximates the Fisher information matrix as:
# F_l ≈ A_l ⊗ G_l  (Kronecker product of activation and gradient covariances)
# Inverse: (A_l ⊗ G_l)^{-1} = A_l^{-1} ⊗ G_l^{-1}
# Both A_l^{-1} and G_l^{-1} computed via Cholesky

def kfac_step(A, G, gradient, damping=1e-3):
    """Compute K-FAC natural gradient step.
    A: activation covariance (d_in x d_in)
    G: gradient covariance (d_out x d_out)
    gradient: weight gradient (d_out x d_in)
    Returns natural gradient as (d_out x d_in) matrix.
    """
    d_in = A.shape[0]
    d_out = G.shape[0]

    # Add damping for numerical stability
    A_reg = A + damping * np.eye(d_in)
    G_reg = G + damping * np.eye(d_out)

    # Invert via Cholesky
    L_A = la.cholesky(A_reg, lower=True)
    L_G = la.cholesky(G_reg, lower=True)

    A_inv = la.cho_solve((L_A, True), np.eye(d_in))
    G_inv = la.cho_solve((L_G, True), np.eye(d_out))

    # Natural gradient: F^{-1} vec(grad) = (A^{-1} ⊗ G^{-1}) vec(grad), i.e. G_inv @ gradient @ A_inv
    nat_grad = G_inv @ gradient @ A_inv
    return nat_grad

# Simulate one K-FAC step
np.random.seed(42)
d_in, d_out = 64, 32
batch_size = 128

# Random activations and pre-activation gradients
acts = np.random.randn(batch_size, d_in)         # activations
pre_grads = np.random.randn(batch_size, d_out)   # pre-activation gradients

# Estimate factor covariances
A_cov = (acts.T @ acts) / batch_size             # d_in x d_in
G_cov = (pre_grads.T @ pre_grads) / batch_size   # d_out x d_out

# Weight gradient
W_grad = (pre_grads.T @ acts) / batch_size       # d_out x d_in

nat_gradient = kfac_step(A_cov, G_cov, W_grad)

print(f'K-FAC natural gradient shape: {nat_gradient.shape}')
print(f'Gradient norm: {np.linalg.norm(W_grad):.4f}')
print(f'Natural gradient norm: {np.linalg.norm(nat_gradient):.4f}')

# Cholesky cost: two n^3/3 factorizations
flops_chol_A = d_in**3 / 3
flops_chol_G = d_out**3 / 3
flops_mat_mul = d_out**2 * d_in + d_out * d_in**2  # G_inv @ grad @ A_inv
print(f'\nK-FAC step flops: {flops_chol_A + flops_chol_G + flops_mat_mul:.0f}')
print(f'Full Fisher inverse would cost: {(d_in*d_out)**3:.2e} flops (infeasible)')
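
# Sanity check of the Kronecker identity on tiny sizes (added illustration): form
# the full Kronecker-factored Fisher F = A ⊗ G explicitly and compare F^{-1} vec(grad)
# against kfac_step (defined above) with zero damping.  Only feasible for tiny
# d_in, d_out, since F is (d_in*d_out) x (d_in*d_out).
d_in_s, d_out_s = 5, 4
Xa = np.random.randn(50, d_in_s);  A_small = Xa.T @ Xa / 50 + 1e-3 * np.eye(d_in_s)
Xg = np.random.randn(50, d_out_s); G_small = Xg.T @ Xg / 50 + 1e-3 * np.eye(d_out_s)
grad_small = np.random.randn(d_out_s, d_in_s)

nat_kfac = kfac_step(A_small, G_small, grad_small, damping=0.0)

F_full = np.kron(A_small, G_small)                      # column-stacking vec convention
nat_vec = np.linalg.solve(F_full, grad_small.flatten(order='F'))
nat_full = nat_vec.reshape(d_out_s, d_in_s, order='F')
print(f"Kronecker identity check (kfac_step vs explicit F^-1): "
      f"{'PASS' if np.allclose(nat_kfac, nat_full) else 'FAIL'}")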

Code cell 44

# === 12.2 Linear Probing via QR Least Squares ===

import numpy as np, scipy.linalg as la

# Simulate: extract features from a pretrained model, fit linear head
np.random.seed(42)
n_samples = 1000  # training samples
d_feat = 256      # feature dimension
n_classes = 10    # number of classes

# Synthetic features (roughly like neural net embeddings)
X_feat = np.random.randn(n_samples, d_feat) / np.sqrt(d_feat)
# Ground-truth linear head weights (continuous regression targets stand in for class labels)
W_true = np.random.randn(d_feat, n_classes) / np.sqrt(d_feat)
Y = X_feat @ W_true + 0.01 * np.random.randn(n_samples, n_classes)

# Method 1: Normal equations (A^T A W = A^T Y)
XtX = X_feat.T @ X_feat
XtY = X_feat.T @ Y
W_ne = np.linalg.solve(XtX, XtY)  # solves (X^T X) W = X^T Y

# Method 2: QR-based least squares (more stable)
W_qr, _, _, _ = np.linalg.lstsq(X_feat, Y, rcond=None)

# Method 3: Ridge regression (regularized)
lam = 1e-4
W_ridge = np.linalg.solve(XtX + lam * np.eye(d_feat), XtY)

# Training residuals
res_ne = np.linalg.norm(X_feat @ W_ne - Y, 'fro') / np.linalg.norm(Y, 'fro')
res_qr = np.linalg.norm(X_feat @ W_qr - Y, 'fro') / np.linalg.norm(Y, 'fro')
res_ridge = np.linalg.norm(X_feat @ W_ridge - Y, 'fro') / np.linalg.norm(Y, 'fro')

err_ne = np.linalg.norm(W_ne - W_true, 'fro') / np.linalg.norm(W_true, 'fro')
err_qr = np.linalg.norm(W_qr - W_true, 'fro') / np.linalg.norm(W_true, 'fro')
err_ridge = np.linalg.norm(W_ridge - W_true, 'fro') / np.linalg.norm(W_true, 'fro')

print('Linear probing results:')
print(f'  Normal equations:  residual={res_ne:.4f}, solution error={err_ne:.4f}')
print(f'  QR (lstsq):       residual={res_qr:.4f}, solution error={err_qr:.4f}')
print(f'  Ridge (lambda={lam}): residual={res_ridge:.4f}, solution error={err_ridge:.4f}')
print(f'Condition number of X: {np.linalg.cond(X_feat):.2e}')

13. Summary: When to Use Which Factorization

The choice of factorization depends on matrix structure, problem type, and stability requirements.

Code cell 46

# === 13.1 Decision Table with Numerical Verification ===

import numpy as np, scipy.linalg as la

np.random.seed(42)

def make_spd(n, kappa=10):
    U, _ = np.linalg.qr(np.random.randn(n, n))
    sv = np.logspace(0, -np.log10(kappa), n)
    return U @ np.diag(sv) @ U.T

def make_general(n):
    return np.random.randn(n, n) + n * np.eye(n)

def make_overdetermined(m, n):
    return np.random.randn(m, n)

n = 50
A_spd = make_spd(n)
A_gen = make_general(n)
A_over = make_overdetermined(2*n, n)
b = np.random.randn(n)
b_long = np.random.randn(2*n)
x_true = np.ones(n)

print('=== Solving Ax = b ===')

# SPD: use Cholesky
b_spd = A_spd @ x_true
L_c = la.cholesky(A_spd, lower=True)
x_chol = la.cho_solve((L_c, True), b_spd)
print(f'Cholesky (SPD): error = {np.linalg.norm(x_chol - x_true)/np.linalg.norm(x_true):.2e}')

# General: use LU
b_gen = A_gen @ x_true
lu_f, piv_f = la.lu_factor(A_gen)
x_lu = la.lu_solve((lu_f, piv_f), b_gen)
print(f'LU partial pivot (general): error = {np.linalg.norm(x_lu - x_true)/np.linalg.norm(x_true):.2e}')

print('\n=== Solving Least Squares min ||Ax-b|| ===')

# Overdetermined: use QR
x_true_ls = np.ones(n)
b_ls = A_over @ x_true_ls + 0.01 * np.random.randn(2*n)
x_qr_ls, _, _, _ = np.linalg.lstsq(A_over, b_ls, rcond=None)
print(f'QR lstsq: error = {np.linalg.norm(x_qr_ls - x_true_ls)/np.linalg.norm(x_true_ls):.2e}')

print('\n=== Summary Table ===')
print('Matrix type       | Algorithm          | Cost      | Stability')
print('-' * 65)
print('SPD              | Cholesky (dpotrf)  | n^3/3     | Unconditional')
print('Sym. indefinite  | LDL^T (dsytrf)     | n^3/3     | Bunch-Kaufman')
print('General square   | LU + partial pivot | 2n^3/3    | Practical')
print('Overdetermined   | Householder QR     | 2mn^2     | Unconditional')
print('Rank-deficient   | RRQR or SVD        | 2mn^2     | Unconditional')

Appendix: Condition Number Estimation and Sparse Factorizations

Condition estimation and fill-in reduction for sparse systems.

Code cell 48

# === A.1 Condition Number Estimation ===

import numpy as np, scipy.linalg as la

# LAPACK's DGECON estimates the condition number cheaply from an existing LU
# factorization (O(n^2) extra work per estimate).  Here we mimic that idea with a
# few power / inverse-power iterations that reuse the LU factors.

np.random.seed(42)
sizes_cond = [10, 50, 100, 200]

def estimate_cond2(A, n_iter=20):
    """Cheap 2-norm condition estimate: power iteration on A^T A for sigma_max,
    inverse iteration via cached LU solves for sigma_min."""
    n = A.shape[0]
    lu, piv = la.lu_factor(A)
    x = np.random.randn(n); x /= np.linalg.norm(x)
    z = np.random.randn(n); z /= np.linalg.norm(z)
    for _ in range(n_iter):
        x = A.T @ (A @ x); x /= np.linalg.norm(x)
        z = la.lu_solve((lu, piv), la.lu_solve((lu, piv), z, trans=1))
        z /= np.linalg.norm(z)
    sigma_max = np.linalg.norm(A @ x)
    w = la.lu_solve((lu, piv), la.lu_solve((lu, piv), z, trans=1))  # (A^T A)^{-1} z
    sigma_min = 1.0 / np.sqrt(np.linalg.norm(w))
    return sigma_max / sigma_min

print('Condition number estimation:')
print(f'{"n":>5} | {"True kappa":>12} | {"Estimate":>12} | {"Ratio":>8}')
print('-' * 48)

for n in sizes_cond:
    # Build matrix with known condition number
    U, _ = np.linalg.qr(np.random.randn(n, n))
    V, _ = np.linalg.qr(np.random.randn(n, n))
    kappa_target = 10**np.random.uniform(2, 8)
    sv = np.logspace(0, -np.log10(kappa_target), n)
    A = U @ np.diag(sv) @ V.T

    kappa_true = np.linalg.cond(A, 2)   # exact, via SVD
    kappa_est = estimate_cond2(A)       # cheap, reuses the LU factors

    print(f'{n:>5} | {kappa_true:>12.2e} | {kappa_est:>12.2e} | {kappa_est/kappa_true:>8.2f}')

print('\nRule of thumb: if kappa = 10^k, expect to lose about k of the ~16 digits in FP64.')

Code cell 49

# === A.2 Sparse Fill-In Demonstration ===

import numpy as np

# Sparse tridiagonal matrix: bandwidth 1
n = 10
A_tridiag = np.diag(4 * np.ones(n)) + np.diag(-1 * np.ones(n-1), 1) + np.diag(-1 * np.ones(n-1), -1)

# LU factorization
import scipy.linalg as la
P, L, U = la.lu(A_tridiag)

# Count nonzeros
tol = 1e-10
nnz_A = np.sum(np.abs(A_tridiag) > tol)
nnz_L = np.sum(np.abs(L) > tol)
nnz_U = np.sum(np.abs(U) > tol)

print(f'Tridiagonal matrix ({n}x{n}):')
print(f'  Nonzeros in A: {nnz_A} (density {nnz_A/n**2:.1%})')
print(f'  Nonzeros in L: {nnz_L} (density {nnz_L/n**2:.1%})')
print(f'  Nonzeros in U: {nnz_U} (density {nnz_U/n**2:.1%})')
print(f'  Fill-in: L+U has {nnz_L+nnz_U} nonzeros vs {nnz_A} in A')

print('\nFor tridiagonal: LU stays bidiagonal (no fill-in!).')
print('For general sparse: reordering (AMD, nested dissection) minimizes fill-in.')

# Cholesky of SPD tridiagonal (positive definite version)
A_spd_tri = A_tridiag.copy()  # already symmetric; check PD
eigmin = np.linalg.eigvalsh(A_spd_tri).min()
print(f'\nSmallest eigenvalue of tridiagonal: {eigmin:.4f} (positive -> SPD)')
L_chol = la.cholesky(A_spd_tri, lower=True)
nnz_Lchol = np.sum(np.abs(L_chol) > tol)
print(f'Cholesky L: {nnz_Lchol} nonzeros (same structure as lower band)')
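
# The dense tridiagonal example cannot show reordering effects, so here is a small
# added sketch on a genuinely sparse problem (assumes scipy.sparse is available):
# a 2-D grid Laplacian fills in heavily under the natural ordering, while a
# fill-reducing column ordering (COLAMD) typically shrinks the factors.
import scipy.sparse as sp
import scipy.sparse.linalg as spla

k_grid = 20                                    # 20x20 grid -> 400 unknowns
T_1d = sp.diags([-1, 2, -1], [-1, 0, 1], shape=(k_grid, k_grid))
A_grid = sp.kronsum(T_1d, T_1d).tocsc()        # 2-D Laplacian, ~5 nonzeros per row

for spec in ['NATURAL', 'COLAMD']:
    lu_sp = spla.splu(A_grid, permc_spec=spec)
    print(f'{spec:>8}: nnz(A) = {A_grid.nnz}, nnz(L) + nnz(U) = {lu_sp.L.nnz + lu_sp.U.nnz}')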

Code cell 50

# === A.3 Complete Worked Example: Solving a GP System Step by Step ===

import numpy as np, scipy.linalg as la

# Complete pipeline: build kernel matrix, factor, solve, compute log-det
np.random.seed(42)

# 1. Training data
n_gp = 30
X_gp = np.linspace(0, 4*np.pi, n_gp).reshape(-1, 1)
y_gp = np.sin(X_gp.ravel()) + 0.1 * np.random.randn(n_gp)

# 2. Kernel matrix (RBF, ell=1.0, sf=1.0)
ell, sf, sigma_n = 1.0, 1.0, 0.1
diff = X_gp[:, None, :] - X_gp[None, :, :]
K = sf**2 * np.exp(-np.sum(diff**2, axis=-1) / (2 * ell**2))
K_noisy = K + sigma_n**2 * np.eye(n_gp)

print(f'Kernel matrix size: {K_noisy.shape}')
print(f'Condition number: {np.linalg.cond(K_noisy):.2e}')

# 3. Cholesky factorization
L_gp = la.cholesky(K_noisy, lower=True)
print(f'Cholesky computed. L diagonal range: [{L_gp.diagonal().min():.4f}, {L_gp.diagonal().max():.4f}]')

# 4. Solve for alpha = K^{-1} y via forward + backward substitution
v_gp = la.solve_triangular(L_gp, y_gp, lower=True)     # L v = y
alpha = la.solve_triangular(L_gp.T, v_gp, lower=False)  # L^T alpha = v

# 5. Log marginal likelihood
log_det = 2 * np.sum(np.log(L_gp.diagonal()))   # log det K = 2 sum log L_ii
data_fit = 0.5 * y_gp @ alpha
complexity = 0.5 * log_det
const = 0.5 * n_gp * np.log(2 * np.pi)
log_marg_lik = -(data_fit + complexity + const)

print(f'\nLog marginal likelihood decomposition:')
print(f'  Data fit term:   -{data_fit:.3f}')
print(f'  Complexity term: -{complexity:.3f}  (log det = {log_det:.3f})')
print(f'  Constant:        -{const:.3f}')
print(f'  Total:            {log_marg_lik:.3f}')

# 6. Verify: alpha should satisfy K_noisy @ alpha ≈ y_gp
residual = np.linalg.norm(K_noisy @ alpha - y_gp) / np.linalg.norm(y_gp)
print(f'\nResidual ||K alpha - y|| / ||y|| = {residual:.2e}')
print(f"{'PASS' if residual < 1e-10 else 'FAIL'} — GP system solved accurately via Cholesky")

Code cell 51

# === A.4 Benchmarking All Three Factorizations Together ===

import numpy as np, scipy.linalg as la, time

np.random.seed(42)

print('Final benchmark: LU vs QR vs Cholesky')
print(f'{"n":>6} | {"LU (ms)":>9} | {"QR (ms)":>9} | {"Chol (ms)":>10} | '
      f'{"Chol/LU":>8} | {"Chol theory"}' )
print('-' * 70)

for n in [50, 100, 200, 400, 800]:
    # Build SPD matrix
    X = np.random.randn(n, n + 20)
    A_spd = X @ X.T / (n+20) + np.eye(n)
    A_gen = A_spd.copy()  # also a valid general matrix

    reps = max(3, int(1000 / n))

    t0 = time.perf_counter()
    for _ in range(reps):
        la.lu_factor(A_gen)
    t_lu = (time.perf_counter() - t0) / reps * 1e3

    t0 = time.perf_counter()
    for _ in range(reps):
        la.qr(A_gen)
    t_qr = (time.perf_counter() - t0) / reps * 1e3

    t0 = time.perf_counter()
    for _ in range(reps):
        la.cholesky(A_spd)
    t_chol = (time.perf_counter() - t0) / reps * 1e3

    ratio = t_chol / t_lu
    theoretical = '~0.5' if ratio < 0.7 else f'{ratio:.2f}'
    print(f'{n:>6} | {t_lu:>9.2f} | {t_qr:>9.2f} | {t_chol:>10.2f} | '
          f'{ratio:>8.2f} | {theoretical}')

print('\nCholesky should be ~0.5x the cost of LU (symmetric exploitation).')
print('QR is more expensive than LU (orthogonalization overhead).')
print('\nAll factorization cells complete.')

Summary

This notebook covered all major matrix decompositions as computational tools:

Section | Algorithm                     | Key insight
--------|-------------------------------|---------------------------------------------------
1       | Forward/backward substitution | Triangular = solved; $O(n^2)$
2       | LU with partial pivoting      | $PA = LU$; pivoting controls the growth factor
3       | Householder QR                | Sign convention; $\|\hat{Q}^\top\hat{Q}-I\| = O(u)$
4       | Givens rotations              | Selective entry zeroing; sparse/streaming
5       | Rank-revealing QR             | $AP = QR$; $|\mathrm{diag}(R)|$ exposes numerical rank
6       | Cholesky (computational)      | Half the cost of LU; $\log\det = 2\sum\log L_{ii}$
7       | Backward error                | Forward error $\leq \kappa(A) \times$ backward error
8       | Blocked algorithms            | BLAS-3 $\Rightarrow$ near-peak hardware throughput
9       | GP inference                  | Cholesky for $K^{-1}\mathbf{y}$ and $\log\det K$
10      | Differentiation               | Implicit diff through $Ax=b$; $O(n^2)$ adjoint
11      | Randomized SVD                | $O(mnr)$ near-optimal approximation (LoRA, GaLore)
12      | ML applications               | K-FAC, linear probing, natural gradient

Next: §04 Calculus Fundamentals, where Hessians (Chapter 4 §05) draw on Cholesky (SPD at minima) and Newton's method requires LU/Cholesky solves for Hessian systems.