
Singular Value Decomposition — Theory Notebook

"Every matrix tells a story of rotation, scaling, and rotation again — the SVD reads that story in full."

Interactive demonstrations: geometric action, Eckart-Young compression, four fundamental subspaces, pseudoinverse, randomised SVD, LoRA decomposition, image compression, weight matrix spectrum analysis.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

import numpy as np
import numpy.linalg as la
from scipy import linalg as sla

try:
    import matplotlib.pyplot as plt
    import matplotlib
    plt.style.use('seaborn-v0_8-whitegrid')
    plt.rcParams['figure.figsize'] = [10, 6]
    plt.rcParams['font.size'] = 12
    HAS_MPL = True
except ImportError:
    HAS_MPL = False

try:
    import seaborn as sns
    HAS_SNS = True
except ImportError:
    HAS_SNS = False

np.set_printoptions(precision=6, suppress=True)
np.random.seed(42)
print('Setup complete. NumPy:', np.__version__)

1. Intuition — Geometric Action

The SVD $A = U\Sigma V^\top$ decomposes any matrix into: rotate ($V^\top$), scale ($\Sigma$), rotate ($U$). We visualise this by watching the unit circle transform into an ellipse, with the singular vectors as its axes.

Code cell 5

# === 1.1 Geometric Action: Unit Circle → Ellipse ===

A = np.array([[3., 1.], [0., 2.]])  # general 2x2
U, s, Vt = la.svd(A)

print('Matrix A:')
print(A)
print(f'\nSingular values: sigma_1={s[0]:.4f}, sigma_2={s[1]:.4f}')
print(f'Left singular vectors (cols of U):')
print(U)
print(f'Right singular vectors (rows of Vt = cols of V):')
print(Vt.T)

# Verify A = U Sigma V^T
Sigma = np.diag(s)
A_recon = U @ Sigma @ Vt
ok = np.allclose(A, A_recon)
print(f'\nPASS - A = U Sigma V^T' if ok else 'FAIL')

# Verify Av = sigma * u
for i in range(2):
    v_i = Vt[i]  # right singular vector
    u_i = U[:, i]  # left singular vector
    lhs = A @ v_i
    rhs = s[i] * u_i
    ok_i = np.allclose(lhs, rhs)
    print(f'PASS - Av_{i+1} = sigma_{i+1} * u_{i+1}' if ok_i else f'FAIL i={i}')

if HAS_MPL:
    theta = np.linspace(0, 2*np.pi, 300)
    circle = np.vstack([np.cos(theta), np.sin(theta)])
    ellipse = A @ circle

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    colors = ['red', 'green']

    # Input space: unit circle + right singular vectors
    ax = axes[0]
    ax.plot(circle[0], circle[1], 'b-', alpha=0.5, label='Unit circle')
    for i in range(2):
        v = Vt[i]
        ax.annotate('', xy=v, xytext=(0,0),
                    arrowprops=dict(arrowstyle='->', color=colors[i], lw=2.5))
        ax.text(v[0]*1.15, v[1]*1.15, f'v{i+1}', fontsize=12, color=colors[i], fontweight='bold')
    ax.set_xlim(-1.5, 1.5); ax.set_ylim(-1.5, 1.5)
    ax.set_aspect('equal')
    ax.set_title('Input space: unit circle + right singular vectors')
    ax.legend()

    # Output space: ellipse + left singular vectors (scaled)
    ax = axes[1]
    ax.plot(ellipse[0], ellipse[1], 'b-', alpha=0.5, label='A(circle)')
    for i in range(2):
        u_scaled = s[i] * U[:, i]
        ax.annotate('', xy=u_scaled, xytext=(0,0),
                    arrowprops=dict(arrowstyle='->', color=colors[i], lw=2.5))
        ax.text(u_scaled[0]*1.1, u_scaled[1]*1.1,
                f'sigma_{i+1}*u{i+1}={s[i]:.2f}', fontsize=10, color=colors[i])
    ax.set_aspect('equal')
    ax.set_title('Output space: ellipse + left singular vectors (semi-axes)')
    ax.legend()

    plt.suptitle(f'SVD: A = U Sigma V^T  |  sigma_1={s[0]:.2f}, sigma_2={s[1]:.2f}', fontsize=13)
    plt.tight_layout(); plt.show()
    print('Plot: unit circle maps to ellipse; semi-axes = singular values')

2. Formal Definitions

2.1–2.3 SVD Theorem, Singular Values, Singular Vectors

We compute the SVD via two routes: (1) directly with np.linalg.svd, (2) via the eigendecomposition of $A^\top A$ and $AA^\top$. We verify that both routes agree and confirm the four subspace relationships.
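
For reference, the statement verified below: for any $A \in \mathbb{R}^{m \times n}$ there exist orthogonal $U \in \mathbb{R}^{m \times m}$, orthogonal $V \in \mathbb{R}^{n \times n}$, and diagonal $\Sigma \in \mathbb{R}^{m \times n}$ with $\sigma_1 \ge \cdots \ge \sigma_{\min(m,n)} \ge 0$ such that

$$A = U \Sigma V^\top, \qquad \sigma_i = \sqrt{\lambda_i(A^\top A)}, \qquad A v_i = \sigma_i u_i, \qquad A^\top u_i = \sigma_i v_i.$$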

Code cell 7

# === 2.1 SVD via Eigendecomposition of A^T A ===

np.random.seed(1)
m, n = 4, 3
A = np.random.randn(m, n)

# Method 1: direct SVD
U_np, s_np, Vt_np = la.svd(A, full_matrices=False)  # thin SVD

# Method 2: via A^T A
AtA = A.T @ A  # (n x n) symmetric PSD
evals, evecs = la.eigh(AtA)  # ascending order
evals = evals[::-1]; evecs = evecs[:, ::-1]  # descending
s_eig = np.sqrt(np.maximum(evals, 0))

# Compute left singular vectors from u_i = A v_i / sigma_i
V_eig = evecs
U_eig = np.zeros((m, n))
for i in range(n):
    if s_eig[i] > 1e-10:
        U_eig[:, i] = A @ V_eig[:, i] / s_eig[i]

print('Matrix A shape:', A.shape)
print(f'\nnp.linalg.svd singular values:   {s_np.round(6)}')
print(f'via eigh(A^T A) singular values: {s_eig.round(6)}')
print(f'Match: {np.allclose(s_np, s_eig)}')

# Verify A = U Sigma V^T (method 2)
A_recon = U_eig @ np.diag(s_eig) @ V_eig.T
print(f'\n||A - U_eig Sigma V_eig^T||_F = {la.norm(A - A_recon):.2e}  (should be ~0)')

# Verify Av_i = sigma_i u_i
print('\nVerification: Av_i = sigma_i * u_i')
for i in range(n):
    lhs = A @ V_eig[:, i]
    rhs = s_eig[i] * U_eig[:, i]
    ok = np.allclose(lhs, rhs, atol=1e-10)
    print(f'  i={i+1}: PASS' if ok else f'  i={i+1}: FAIL, residual={la.norm(lhs-rhs):.2e}')

# Verify A^T u_i = sigma_i v_i
print('\nVerification: A^T u_i = sigma_i * v_i')
for i in range(n):
    lhs = A.T @ U_eig[:, i]
    rhs = s_eig[i] * V_eig[:, i]
    ok = np.allclose(lhs, rhs, atol=1e-10)
    print(f'  i={i+1}: PASS' if ok else f'  i={i+1}: FAIL')

2.4 Full vs Thin vs Truncated SVD

2.5 The Four Fundamental Subspaces from SVD

Code cell 9

# === 2.4-2.5 SVD Variants and Four Subspaces ===

# Build a rank-2 matrix in R^(4x3)
np.random.seed(5)
U0, _ = la.qr(np.random.randn(4, 4))
V0, _ = la.qr(np.random.randn(3, 3))
S0 = np.zeros((4, 3))
S0[0, 0], S0[1, 1] = 5.0, 2.0  # rank-2
A2 = U0 @ S0 @ V0.T

# Full SVD
U_full, s_full, Vt_full = la.svd(A2, full_matrices=True)
# Thin SVD
U_thin, s_thin, Vt_thin = la.svd(A2, full_matrices=False)

print('Matrix A shape:', A2.shape, '| True rank: 2')
print(f'Singular values: {s_full.round(6)}')
print(f'Numerical rank (tol=1e-10): {np.sum(s_full > 1e-10)}')
print(f'\nFull SVD: U={U_full.shape}, s={s_full.shape}, Vt={Vt_full.shape}')
print(f'Thin SVD: U={U_thin.shape}, s={s_thin.shape}, Vt={Vt_thin.shape}')

r = np.sum(s_full > 1e-10)
m_sz, n_sz = A2.shape

# Four fundamental subspaces
col_space   = U_full[:, :r]         # column space basis
left_null   = U_full[:, r:]         # left null space basis
row_space   = Vt_full[:r, :].T      # row space basis
null_space  = Vt_full[r:, :].T      # null space basis

print(f'\nFour Fundamental Subspaces of A ({m_sz}x{n_sz}, rank {r}):')
print(f'  Column space:    dim={col_space.shape[1]}  (spanned by u_1,...,u_r)')
print(f'  Left null space: dim={left_null.shape[1]}  (spanned by u_{{r+1}},...,u_m)')
print(f'  Row space:       dim={row_space.shape[1]}  (spanned by v_1,...,v_r)')
print(f'  Null space:      dim={null_space.shape[1]}  (spanned by v_{{r+1}},...,v_n)')

# Verify: A maps null space to 0
null_test = A2 @ null_space
print(f'\nA * (null space basis) = 0? max={np.abs(null_test).max():.2e}')

# Verify: null space perp row space
overlap = null_space.T @ row_space
print(f'Null space ⊥ row space? max inner product = {np.abs(overlap).max():.2e}')

# Verify: A * row_space vectors land in column space
img = A2 @ row_space
# Project onto column space and check residual
proj = col_space @ (col_space.T @ img)
print(f'A(row space) ⊆ col space? residual = {la.norm(img - proj):.2e}')
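
The cell above instantiates the full and thin variants; the truncated SVD of 2.4 simply keeps the first $k$ singular triples. A minimal standalone sketch (with its own illustrative random matrix, not one from the cells above):

import numpy as np
import numpy.linalg as la

rng = np.random.default_rng(0)
A_t = rng.standard_normal((6, 4))

U_t, s_t, Vt_t = la.svd(A_t, full_matrices=False)  # thin SVD
k = 2                                              # truncation rank
A_k = U_t[:, :k] @ np.diag(s_t[:k]) @ Vt_t[:k, :]  # truncated (rank-k) SVD

print(U_t[:, :k].shape, s_t[:k].shape, Vt_t[:k, :].shape)  # (6, 2) (2,) (2, 4)
print(la.matrix_rank(A_k))                                 # 2, by construction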

3. Computing the SVD

3.1 Numerical Stability: Direct SVD vs Squaring $A^\top A$

We demonstrate why forming $A^\top A$ destroys precision for ill-conditioned matrices.
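
The reason in one line: $A^\top A = V \Sigma^2 V^\top$, so its eigenvalues are $\sigma_i^2$ and

$$\kappa(A^\top A) = \frac{\sigma_1^2}{\sigma_n^2} = \kappa(A)^2 .$$

In double precision, any $\sigma_i \lesssim \sqrt{\varepsilon_{\text{mach}}}\,\sigma_1 \approx 10^{-8}\,\sigma_1$ is therefore lost once the product is formed.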

Code cell 11

# === 3.1 Condition Number Squaring Warning ===

# Ill-conditioned matrix: sigma values span many orders of magnitude
np.random.seed(7)
n_sq = 8
true_svals = np.array([1e7, 1e5, 1e3, 1e1, 1e-1, 1e-3, 1e-5, 1e-7])
Q1, _ = la.qr(np.random.randn(n_sq, n_sq))
Q2, _ = la.qr(np.random.randn(n_sq, n_sq))
A_ill = Q1 @ np.diag(true_svals) @ Q2.T

print(f'True singular values: {true_svals}')
print(f'Condition number kappa(A) = {true_svals[0]/true_svals[-1]:.2e}')

# Method 1: direct SVD (recommended)
_, s_direct, _ = la.svd(A_ill)

# Method 2: via A^T A (NOT recommended for ill-conditioned)
AtA_ill = A_ill.T @ A_ill
evals_AtA = np.sort(la.eigvalsh(AtA_ill))[::-1]
s_via_AtA = np.sqrt(np.maximum(evals_AtA, 0))

print('\nComparison of computed vs true singular values:')
print(f'{"True":>15} {"Direct SVD":>15} {"Via A^TA":>15} {"Direct err":>12} {"AtA err":>12}')
for i in range(n_sq):
    err_d = abs(s_direct[i] - true_svals[i]) / true_svals[i]
    err_a = abs(s_via_AtA[i] - true_svals[i]) / true_svals[i]
    print(f'{true_svals[i]:15.2e} {s_direct[i]:15.2e} {s_via_AtA[i]:15.2e} {err_d:12.2e} {err_a:12.2e}')

print(f'\nkappa(A^T A) = kappa(A)^2 = {(true_svals[0]/true_svals[-1])**2:.2e}')
print('Small singular values are corrupted when forming A^T A!')
print('Always use np.linalg.svd(A) directly for numerical stability.')

3.3 Randomised SVD

The randomised SVD (Halko, Martinsson, and Tropp, 2011) computes the top-$k$ SVD in $O(mnk)$ time. We implement it from scratch and compare it to np.linalg.svd.

Code cell 13

# === 3.3 Randomised SVD Implementation ===

def randomized_svd(A, k, n_oversampling=5, n_iter=2, random_state=42):
    """
    Randomised SVD: O(mnk) algorithm for top-k approximation.
    Halko, Martinsson, Tropp (2011).
    """
    rng = np.random.RandomState(random_state)
    m, n = A.shape
    p = k + n_oversampling

    # Step 1: Sketch
    Omega = rng.randn(n, p)  # Gaussian sketch
    Y = A @ Omega            # range sketch: (m x p)

    # Step 2: Power iteration for better approximation
    for _ in range(n_iter):
        Y = A @ (A.T @ Y)

    # Step 3: Orthogonalise
    Q, _ = la.qr(Y, mode='reduced')

    # Step 4: Project A onto low-dim space
    B = Q.T @ A  # (p x n)

    # Step 5: SVD of small matrix
    Uhat, s, Vt = la.svd(B, full_matrices=False)

    # Step 6: Recover left singular vectors
    U = Q @ Uhat

    return U[:, :k], s[:k], Vt[:k, :]


# Test on a low-rank + noise matrix
np.random.seed(0)
m_r, n_r = 300, 200
rank_true = 10

# Construct matrix with known top singular values
U0, _ = la.qr(np.random.randn(m_r, rank_true))
V0, _ = la.qr(np.random.randn(n_r, rank_true))
svals_true = np.exp(-np.arange(rank_true) * 0.5)  # decaying
A_lr = U0 @ np.diag(svals_true) @ V0.T + 0.01 * np.random.randn(m_r, n_r)

# Exact SVD (ground truth)
_, s_exact, _ = la.svd(A_lr, full_matrices=False)

# Randomised SVD
k = 15
U_rand, s_rand, Vt_rand = randomized_svd(A_lr, k=k, n_oversampling=5, n_iter=2)

# Reconstruction error
A_approx_rand = U_rand @ np.diag(s_rand) @ Vt_rand
# Exact rank-k truncation (the Eckart-Young optimum) for comparison
u_ex, s_ex, vt_ex = la.svd(A_lr, full_matrices=False)
A_k_exact = u_ex[:, :k] @ np.diag(s_ex[:k]) @ vt_ex[:k, :]

err_rand  = la.norm(A_lr - A_approx_rand, 'fro') / la.norm(A_lr, 'fro')
err_exact = la.norm(A_lr - A_k_exact, 'fro') / la.norm(A_lr, 'fro')

print(f'Matrix: {m_r}x{n_r}, true rank ≈ {rank_true}')
print(f'k = {k} singular values requested')
print(f'Randomised SVD relative Frobenius error: {err_rand:.4f}')
print(f'Exact truncated SVD relative error:      {err_exact:.4f}')
print(f'Singular values comparison (first {k}):')
print(f'  Exact:      {s_ex[:k].round(4)}')
print(f'  Randomised: {s_rand.round(4)}')
print(f'PASS - randomised SVD matches exact' if err_rand < err_exact * 2 else 'WARNING: high error')

4. Properties of the SVD

4.1 Norms via Singular Values
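
The identities verified in the next cell, stated in terms of singular values:

$$\|A\|_2 = \sigma_1, \qquad \|A\|_F = \sqrt{\sum_i \sigma_i^2}, \qquad \|A\|_* = \sum_i \sigma_i .$$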

Code cell 15

# === 4.1 Matrix Norms via Singular Values ===

np.random.seed(3)
A_n = np.random.randn(5, 4)
_, s_n, _ = la.svd(A_n)

# Spectral norm = sigma_1
spectral_svd = s_n[0]
spectral_np  = la.norm(A_n, ord=2)

# Frobenius norm = sqrt(sum sigma_i^2)
frob_svd = np.sqrt(np.sum(s_n**2))
frob_np  = la.norm(A_n, 'fro')

# Nuclear norm = sum sigma_i
nuclear_svd = np.sum(s_n)
nuclear_np  = la.norm(A_n, 'nuc')

print('Matrix A shape:', A_n.shape)
print(f'Singular values: {s_n.round(4)}')
print(f'\nSpectral norm  ||A||_2 = sigma_1:')
print(f'  Via SVD:   {spectral_svd:.6f}')
print(f'  Via numpy: {spectral_np:.6f}')
print(f'  Match: {np.isclose(spectral_svd, spectral_np)}')
print(f'\nFrobenius norm ||A||_F = sqrt(sum sigma_i^2):')
print(f'  Via SVD:   {frob_svd:.6f}')
print(f'  Via numpy: {frob_np:.6f}')
print(f'  Match: {np.isclose(frob_svd, frob_np)}')
print(f'\nNuclear norm   ||A||_* = sum sigma_i:')
print(f'  Via SVD:   {nuclear_svd:.6f}')
print(f'  Via numpy: {nuclear_np:.6f}')
print(f'  Match: {np.isclose(nuclear_svd, nuclear_np)}')

# Verify norm inequalities (r = numerical rank)
r_n = np.sum(s_n > 1e-10)
print(f'\nNorm inequalities:')
print(f'  ||A||_2 = {spectral_svd:.3f} <= ||A||_F = {frob_svd:.3f} <= sqrt(r)*||A||_2 = {np.sqrt(r_n)*spectral_svd:.3f}')
print(f'  ||A||_2 = {spectral_svd:.3f} <= ||A||_* = {nuclear_svd:.3f} <= sqrt(r)*||A||_F = {np.sqrt(r_n)*frob_svd:.3f}')
print('PASS - all norm inequalities satisfied' if spectral_svd <= frob_svd <= np.sqrt(r_n)*spectral_svd else 'FAIL')

4.2 The Eckart-Young Theorem

The rank-$k$ truncated SVD is the best rank-$k$ approximation in both the spectral and Frobenius norms. We verify this experimentally and visualise the error versus rank.
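
Stated precisely, with $A_k = \sum_{i=1}^{k} \sigma_i \mathbf{u}_i \mathbf{v}_i^\top$ the rank-$k$ truncation:

$$\min_{\operatorname{rank}(B) \le k} \|A - B\|_2 = \sigma_{k+1}, \qquad \min_{\operatorname{rank}(B) \le k} \|A - B\|_F = \sqrt{\sum_{i > k} \sigma_i^2},$$

and both minima are attained by $B = A_k$.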

Code cell 17

# === 4.2 Eckart-Young: Best Low-Rank Approximation ===

np.random.seed(10)
m_ey, n_ey = 20, 15
A_ey = np.random.randn(m_ey, n_ey)
U_ey, s_ey, Vt_ey = la.svd(A_ey, full_matrices=False)

print('Eckart-Young Theorem Verification')
print(f'Matrix: {m_ey}x{n_ey}, rank={len(s_ey)}')
print(f'{"k":>4} {"sigma_k+1":>12} {"||A-A_k||_2":>14} {"||A-A_k||_F":>14} {"Check_2":>8} {"Check_F":>8}')

errors_2, errors_F, bounds_2, bounds_F = [], [], [], []

for k in range(1, len(s_ey)):
    # Rank-k approximation
    A_k = U_ey[:, :k] @ np.diag(s_ey[:k]) @ Vt_ey[:k, :]
    err_2 = la.norm(A_ey - A_k, ord=2)
    err_F = la.norm(A_ey - A_k, 'fro')
    bound_2 = s_ey[k]  # Eckart-Young: should equal sigma_{k+1}
    bound_F = np.sqrt(np.sum(s_ey[k:]**2))

    check_2 = np.isclose(err_2, bound_2, rtol=1e-5)
    check_F = np.isclose(err_F, bound_F, rtol=1e-5)
    errors_2.append(err_2); errors_F.append(err_F)
    bounds_2.append(bound_2); bounds_F.append(bound_F)

    if k <= 6:
        print(f'{k:>4} {bound_2:>12.6f} {err_2:>14.6f} {err_F:>14.6f} {str(check_2):>8} {str(check_F):>8}')

all_check_2 = all(np.isclose(e, b, rtol=1e-5) for e, b in zip(errors_2, bounds_2))
all_check_F = all(np.isclose(e, b, rtol=1e-5) for e, b in zip(errors_F, bounds_F))
print(f'\nPASS - ||A-A_k||_2 = sigma_{{k+1}} for all k' if all_check_2 else 'FAIL spectral norm')
print(f'PASS - ||A-A_k||_F = sqrt(sum sigma_i^2, i>k) for all k' if all_check_F else 'FAIL Frobenius')

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(13, 4))
    ks = range(1, len(s_ey))
    axes[0].semilogy(ks, errors_2, 'b-o', markersize=5, label='||A-A_k||_2')
    axes[0].semilogy(ks, bounds_2, 'r--', alpha=0.7, label='sigma_{k+1} (bound)')
    axes[0].set_xlabel('Rank k'); axes[0].set_ylabel('Error (log)')
    axes[0].set_title('Eckart-Young: Spectral Norm Error')
    axes[0].legend()
    axes[1].semilogy(ks, errors_F, 'b-o', markersize=5, label='||A-A_k||_F')
    axes[1].semilogy(ks, bounds_F, 'r--', alpha=0.7, label='sqrt(sum s_i^2, i>k)')
    axes[1].set_xlabel('Rank k'); axes[1].set_ylabel('Error (log)')
    axes[1].set_title('Eckart-Young: Frobenius Norm Error')
    axes[1].legend()
    plt.tight_layout(); plt.show()
    print('Plot: Eckart-Young errors match theoretical bounds exactly')

4.3 The Moore-Penrose Pseudoinverse

$A^+ = V\Sigma^+ U^\top$ inverts the nonzero singular values and zeros out the rest. We implement it, verify the four Moore-Penrose conditions, and solve a least-squares problem.
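
In matrix form, $\Sigma^+$ transposes $\Sigma$ and inverts its nonzero entries:

$$A^+ = V \Sigma^+ U^\top, \qquad (\Sigma^+)_{ii} = \begin{cases} 1/\sigma_i & \sigma_i > 0, \\ 0 & \text{otherwise}; \end{cases}$$

$x = A^+ b$ is then the least-squares solution of minimum Euclidean norm.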

Code cell 19

# === 4.3 Moore-Penrose Pseudoinverse via SVD ===

def pseudoinverse(A, tol=None):
    """Compute A^+ = V Sigma^+ U^T via SVD."""
    U_p, s_p, Vt_p = la.svd(A, full_matrices=False)
    if tol is None:
        tol = np.finfo(float).eps * max(A.shape) * s_p[0]
    s_inv = np.where(s_p > tol, 1.0 / s_p, 0.0)
    return Vt_p.T @ np.diag(s_inv) @ U_p.T


np.random.seed(2)
m_p, n_p = 5, 3
A_p = np.random.randn(m_p, n_p)
Ap  = pseudoinverse(A_p)
Ap_np = la.pinv(A_p)  # numpy reference

print(f'A shape: {A_p.shape}  |  A^+ shape: {Ap.shape}')
print(f'Match numpy pinv: {np.allclose(Ap, Ap_np)}')

# Verify four Moore-Penrose conditions
cond1 = np.allclose(A_p @ Ap @ A_p, A_p)
cond2 = np.allclose(Ap @ A_p @ Ap, Ap)
cond3 = np.allclose((A_p @ Ap).T, A_p @ Ap)
cond4 = np.allclose((Ap @ A_p).T, Ap @ A_p)
print(f'\nMoore-Penrose conditions:')
print(f'  (1) A A^+ A = A:         {cond1}')
print(f'  (2) A^+ A A^+ = A^+:     {cond2}')
print(f'  (3) (A A^+)^T = A A^+:   {cond3}')
print(f'  (4) (A^+ A)^T = A^+ A:   {cond4}')
print(f'PASS - all 4 conditions satisfied' if all([cond1,cond2,cond3,cond4]) else 'FAIL')

# Solve overdetermined system: min ||Ax - b||_2
b_p = np.random.randn(m_p)
x_pinv = Ap @ b_p       # pseudoinverse solution
x_lstsq = la.lstsq(A_p, b_p, rcond=None)[0]  # numpy lstsq

print(f'\nLeast squares solution:')
print(f'  ||A x_pinv - b||_2    = {la.norm(A_p @ x_pinv - b_p):.8f}')
print(f'  ||A x_lstsq - b||_2   = {la.norm(A_p @ x_lstsq - b_p):.8f}')
print(f'  Solutions match: {np.allclose(x_pinv, x_lstsq)}')

# AA^+ = projection onto column space
P_col = A_p @ Ap
print(f'\nAA^+ is orthogonal projection: symmetric={np.allclose(P_col, P_col.T)}, idempotent={np.allclose(P_col @ P_col, P_col)}')

4.5 Condition Number and Ridge Regression (Spectral Shrinkage)
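
Ridge regression solves $\min_x \|Ax - b\|_2^2 + \lambda \|x\|_2^2$. In the SVD basis, the pseudoinverse's $1/\sigma_i$ factors are replaced by shrunken ones, which is exactly what the next cell computes:

$$x_\lambda = \sum_i \frac{\sigma_i}{\sigma_i^2 + \lambda}\,(u_i^\top b)\,v_i,$$

so the directions with small $\sigma_i$, the ones that amplify noise, are damped the most.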

Code cell 21

# === 4.5 Condition Number and Ridge Regression ===

np.random.seed(15)
n_c = 6
# Build matrix with specified condition number
true_svals_c = np.array([100., 50., 20., 5., 1., 0.1])
Q1c, _ = la.qr(np.random.randn(n_c, n_c))
Q2c, _ = la.qr(np.random.randn(n_c, n_c))
A_c = Q1c @ np.diag(true_svals_c) @ Q2c.T

kappa = true_svals_c[0] / true_svals_c[-1]
print(f'Singular values: {true_svals_c}')
print(f'Condition number kappa = sigma_1/sigma_n = {kappa:.0f}')

# True solution
x_true = np.random.randn(n_c)
b_true = A_c @ x_true
b_noisy = b_true + 0.1 * np.random.randn(n_c)  # add noise

# Pseudoinverse solution (no regularisation)
x_pinv = la.pinv(A_c) @ b_noisy

# Ridge regression with different lambda
lambdas = [0, 0.01, 0.1, 1.0, 10.0, 100.0]
errors = []
U_c, s_c, Vt_c = la.svd(A_c, full_matrices=False)

print(f'\nTrue solution ||x*||_2 = {la.norm(x_true):.4f}')
print(f'{"lambda":>10} {"||x_ridge - x*||":>18} {"Shrinkage factors":>30}')

for lam in lambdas:
    # Ridge: x = V diag(s/(s^2+lam)) U^T b
    shrink = s_c / (s_c**2 + lam)
    x_ridge = Vt_c.T @ np.diag(shrink) @ U_c.T @ b_noisy
    err = la.norm(x_ridge - x_true)
    errors.append(err)
    shrink_str = ' '.join([f'{s:.3f}' for s in shrink])
    print(f'{lam:>10.3g} {err:>18.6f} [{shrink_str}]')

best_lam = lambdas[np.argmin(errors)]
print(f'\nBest lambda = {best_lam} (min error = {min(errors):.6f})')
print(f'Unregularised error (lambda=0) = {errors[0]:.6f}')

if HAS_MPL:
    plt.figure(figsize=(8, 4))
    plt.semilogx([l + 1e-5 for l in lambdas], errors, 'bo-', markersize=8)
    plt.xlabel('Regularisation lambda'); plt.ylabel('||x_ridge - x*||_2')
    plt.title(f'Ridge Regression: Error vs Lambda (kappa={kappa:.0f})')
    plt.tight_layout(); plt.show()
    print('Plot: ridge regression error vs lambda (spectral shrinkage)')

4.6 The Orthogonal Procrustes Problem

Find the orthogonal matrix $R$ that best aligns $A$ to $B$: $\hat{R} = UV^\top$ from the SVD of $BA^\top$ (points as columns; the cell below stores points as rows, so it applies the same recipe to $B^\top A$).
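
A sketch of why $\hat{R} = UV^\top$ is optimal (row convention, matching the cell below): with $M = B^\top A = U \Sigma V^\top$,

$$\|A R^\top - B\|_F^2 = \|A\|_F^2 + \|B\|_F^2 - 2\operatorname{tr}(R M^\top), \qquad \operatorname{tr}(R M^\top) = \operatorname{tr}\big((U^\top R V)\,\Sigma\big) \le \sum_i \sigma_i,$$

with equality iff $U^\top R V = I$, i.e. $R = UV^\top$.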

Code cell 23

# === 4.6 Orthogonal Procrustes Problem ===

np.random.seed(20)
d = 4
n_pts = 20

# Generate a point cloud A and apply a (noisy) rotation to get B
A_proc = np.random.randn(n_pts, d)  # (n x d): n points in d dimensions
R_true, _ = la.qr(np.random.randn(d, d))  # true orthogonal transform
if la.det(R_true) < 0:  # ensure proper rotation
    R_true[:, 0] *= -1
noise = 0.1 * np.random.randn(n_pts, d)
B_proc = A_proc @ R_true.T + noise  # B ≈ A R^T

# Procrustes (points as rows): find R* = argmin_R ||A R^T - B||_F over orthogonal R,
# equivalently min ||R A^T - B^T||_F with points as columns.
# Optimal R = U V^T from the SVD of M = B^T A  (d x d)
M = B_proc.T @ A_proc  # (d x d)
U_proc, s_proc, Vt_proc = la.svd(M)
R_hat = U_proc @ Vt_proc

# Ensure proper rotation (det = +1)
if la.det(R_hat) < 0:
    U_proc[:, -1] *= -1
    R_hat = U_proc @ Vt_proc

print(f'True R: det = {la.det(R_true):.4f}')
print(f'Estimated R_hat: det = {la.det(R_hat):.4f}')

# Compare: angle between true and estimated rotation
angle_diff = la.norm(R_hat - R_true, 'fro') / la.norm(R_true, 'fro')
print(f'Relative error ||R_hat - R_true||_F / ||R_true||_F = {angle_diff:.4f}')

# Procrustes gives minimum alignment error
err_procrustes = la.norm(A_proc @ R_hat.T - B_proc, 'fro')
err_true_R     = la.norm(A_proc @ R_true.T - B_proc, 'fro')
print(f'\nAlignment error with Procrustes R: {err_procrustes:.4f}')
print(f'Alignment error with true R:        {err_true_R:.4f}')
print(f'Procrustes <= true (as expected): {err_procrustes <= err_true_R + 1e-10}')
print('PASS - Procrustes finds minimum alignment' if err_procrustes <= err_true_R + 1e-8 else 'FAIL')

5. Outer Product Form — Rank-1 Decomposition

Every matrix is a sum of rank-1 matrices: $A = \sum_i \sigma_i \mathbf{u}_i \mathbf{v}_i^\top$. We visualise this for a synthetic image by showing how successive rank-$k$ reconstructions accumulate these terms.
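
A direct numerical check of the outer-product form on a small random matrix first (a standalone sketch, separate from the image demo below):

import numpy as np
import numpy.linalg as la

rng = np.random.default_rng(1)
A_op = rng.standard_normal((5, 3))
U_op, s_op, Vt_op = la.svd(A_op, full_matrices=False)

# Accumulate the rank-1 terms sigma_i * u_i v_i^T one by one
A_sum = sum(s_op[i] * np.outer(U_op[:, i], Vt_op[i]) for i in range(len(s_op)))
print(np.allclose(A_op, A_sum))  # True: A equals the sum of its rank-1 terms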

Code cell 25

# === 5.1 Rank-1 Outer Product Decomposition (Synthetic Image) ===

# Generate a structured synthetic 'image'
np.random.seed(99)
sz = 64
x = np.linspace(0, 4*np.pi, sz)
# Low-frequency pattern (dominant) + medium + noise
img = (np.outer(np.sin(x), np.cos(x)) * 10 +
       np.outer(np.sin(2*x), np.sin(2*x)) * 3 +
       np.random.randn(sz, sz) * 0.5)

U_img, s_img, Vt_img = la.svd(img, full_matrices=False)

print(f'Image: {sz}x{sz}')
print(f'Singular values (first 8): {s_img[:8].round(3)}')
print(f'Total Frobenius norm squared = sum sigma_i^2: {np.sum(s_img**2):.2f}')

# Reconstruction at different ranks
ranks = [1, 3, 5, 10, 20]
print(f'\n{"Rank":>6} {"% Frob energy":>16} {"||img - img_k||_F":>20}')
for k in ranks:
    img_k = U_img[:, :k] @ np.diag(s_img[:k]) @ Vt_img[:k, :]
    energy = 100 * np.sum(s_img[:k]**2) / np.sum(s_img**2)
    err = la.norm(img - img_k, 'fro')
    print(f'{k:>6} {energy:>16.2f}% {err:>20.4f}')

if HAS_MPL:
    fig, axes = plt.subplots(2, 3, figsize=(14, 8))
    axes[0, 0].imshow(img, cmap='gray')
    axes[0, 0].set_title('Original image')
    axes[0, 0].axis('off')

    for idx, k in enumerate([1, 3, 5, 10, 20]):
        ax = axes[(idx+1)//3, (idx+1)%3]
        img_k = U_img[:, :k] @ np.diag(s_img[:k]) @ Vt_img[:k, :]
        energy = 100 * np.sum(s_img[:k]**2) / np.sum(s_img**2)
        ax.imshow(img_k, cmap='gray')
        ax.set_title(f'Rank {k} ({energy:.1f}% energy)')
        ax.axis('off')

    plt.suptitle('SVD Image Compression: Rank-k Approximations', fontsize=13)
    plt.tight_layout(); plt.show()
    print('Plot: rank-k image approximations showing progressive reconstruction')

6. SVD in Machine Learning and AI

6.1 LoRA — Low-Rank Adaptation via SVD

Code cell 27

# === 6.1 LoRA: Low-Rank Adaptation ===

np.random.seed(42)

# Simulate: pre-trained weight W0, true fine-tuned update DeltaW (low rank)
d_out, d_in = 128, 64
r_true = 4  # true intrinsic rank of fine-tuning update

# Construct true low-rank delta W
U_dw, _ = la.qr(np.random.randn(d_out, r_true))
V_dw, _ = la.qr(np.random.randn(d_in, r_true))
s_dw = np.array([5., 3., 2., 1.])  # singular values of delta W
DeltaW_true = U_dw @ np.diag(s_dw) @ V_dw.T  # rank-4 matrix

print(f'DeltaW shape: {DeltaW_true.shape}')
print(f'DeltaW true rank: {np.linalg.matrix_rank(DeltaW_true)}')
print(f'DeltaW singular values: {np.linalg.svd(DeltaW_true, compute_uv=False)[:8].round(4)}')

# LoRA decompose at rank r
U_lo, s_lo, Vt_lo = la.svd(DeltaW_true, full_matrices=False)

print(f'\nLoRA approximation error vs rank r:')
print(f'{"r":>4} {"Params":>10} {"Rel error":>12} {"Exact?"}') 
for r in [1, 2, 3, 4, 5, 8, 16]:
    r_use = min(r, len(s_lo))
    B = U_lo[:, :r_use] @ np.diag(s_lo[:r_use])
    A_lora = Vt_lo[:r_use, :]
    DeltaW_approx = B @ A_lora
    rel_err = la.norm(DeltaW_approx - DeltaW_true, 'fro') / la.norm(DeltaW_true, 'fro')
    params = d_out * r_use + r_use * d_in
    exact = rel_err < 1e-10
    print(f'{r:>4} {params:>10} {rel_err:>12.4e} {"YES (exact)" if exact else ""}')

# LoRA parameter savings
r_lora = 4
params_full = d_out * d_in
params_lora = d_out * r_lora + r_lora * d_in
print(f'\nFull delta W parameters: {params_full:,}')
print(f'LoRA (r={r_lora}) parameters: {params_lora:,}')
print(f'Compression ratio: {params_full/params_lora:.1f}x')
print(f'At r={r_true}: perfect recovery (rank-{r_true} true update)')
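
How the factors are used at run time: LoRA keeps $W_0$ frozen and adds the trainable low-rank path on top. A minimal forward-pass sketch with the shapes from the cell above (in the original LoRA recipe, B is initialised to zero and the update is scaled by alpha/r; the scaling is omitted here for brevity):

import numpy as np

rng = np.random.default_rng(0)
d_out, d_in, r = 128, 64, 4

W0     = rng.standard_normal((d_out, d_in))  # frozen pre-trained weight
B_lora = np.zeros((d_out, r))                # 'B' starts at zero, so DeltaW starts at 0
A_lora = rng.standard_normal((r, d_in))      # 'A' gets a random init; both are trained

x = rng.standard_normal(d_in)
y = W0 @ x + B_lora @ (A_lora @ x)  # extra cost per input: O(r * (d_out + d_in))
print(y.shape)                      # (128,) -- equals W0 @ x until B_lora is trained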

6.7 Weight Matrix Singular Value Spectrum Analysis (WeightWatcher)
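
Two diagnostics appear in the cell below: a power-law fit to the singular value spectrum, and the stable rank, a smooth surrogate for rank computed as $\|W\|_F^2 / \|W\|_2^2$:

$$\operatorname{srank}(W) = \frac{\|W\|_F^2}{\|W\|_2^2} = \frac{\sum_i \sigma_i^2}{\sigma_1^2} \in \big[1, \operatorname{rank}(W)\big].$$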

Code cell 29

# === 6.7 WeightWatcher: Singular Value Spectrum Analysis ===

np.random.seed(55)

def simulate_weight_matrix(d_out, d_in, alpha, noise_level=0.1, seed=0):
    """
    Simulate a weight matrix with power-law singular value spectrum.
    sigma_i ~ i^(-1/alpha)
    """
    rng = np.random.RandomState(seed)
    n_svs = min(d_out, d_in)
    # Power-law singular values
    svals = np.array([k**(-1.0/alpha) for k in range(1, n_svs+1)])
    svals = svals / svals[0]  # normalise
    Q1, _ = la.qr(rng.randn(d_out, d_out))
    Q2, _ = la.qr(rng.randn(d_in, d_in))
    S = np.zeros((d_out, d_in))
    for i in range(n_svs):
        S[i, i] = svals[i]
    W = Q1 @ S @ Q2.T + noise_level * rng.randn(d_out, d_in)
    return W

# Simulate three 'layers' with different training quality
configs = [
    ('Well-trained (alpha=2.5)', 2.5),
    ('Under-trained (flat, alpha=8)', 8.0),
    ('Over-trained (alpha=1.2)', 1.2),
]

d = 64
for name, alpha_sim in configs:
    W = simulate_weight_matrix(d, d, alpha_sim, noise_level=0.02)
    svals_W = la.svd(W, compute_uv=False)

    # Fit a power law to the spectrum: the generator uses sigma_i ~ i^(-1/alpha),
    # so log(sigma_i) = -(1/alpha) * log(i) + const and alpha = -1/slope
    idx = np.arange(1, len(svals_W)+1)
    log_idx = np.log(idx)
    log_sv  = np.log(np.maximum(svals_W, 1e-10))
    # Linear fit on middle section (avoid very top and very bottom)
    fit_range = slice(len(svals_W)//4, 3*len(svals_W)//4)
    coeffs = np.polyfit(log_idx[fit_range], log_sv[fit_range], 1)
    hat_alpha = -1.0 / coeffs[0]  # recover alpha from the fitted slope

    stable_rank = la.norm(W, 'fro')**2 / la.norm(W, ord=2)**2
    print(f'{name}:')
    print(f'  Fitted alpha: {hat_alpha:.2f} (target {alpha_sim:.1f})')
    print(f'  Stable rank: {stable_rank:.1f}')
    print(f'  sigma_1 = {svals_W[0]:.3f}, sigma_10 = {svals_W[9]:.3f}, sigma_n = {svals_W[-1]:.4f}')
    print()

if HAS_MPL:
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))
    for ax, (name, alpha_sim) in zip(axes, configs):
        W = simulate_weight_matrix(d, d, alpha_sim, noise_level=0.02)
        svals_W = la.svd(W, compute_uv=False)
        idx = np.arange(1, len(svals_W)+1)
        ax.loglog(idx, svals_W, 'b.', markersize=4, alpha=0.7)
        # Fit line
        log_idx = np.log(idx); log_sv = np.log(np.maximum(svals_W, 1e-10))
        fit_range = slice(len(svals_W)//4, 3*len(svals_W)//4)
        coeffs = np.polyfit(log_idx[fit_range], log_sv[fit_range], 1)
        hat_alpha = -1.0 / coeffs[0]  # alpha = -1/slope, matching the generator
        ax.loglog(idx, np.exp(np.polyval(coeffs, log_idx)), 'r--',
                  label=f'fit alpha={hat_alpha:.1f}')
        ax.set_xlabel('Singular value index'); ax.set_ylabel('Sigma_i (log)')
        ax.set_title(name[:25]); ax.legend(fontsize=9)
    plt.suptitle('WeightWatcher: Singular Value Spectra', fontsize=13)
    plt.tight_layout(); plt.show()
    print('Plot: power-law fits to singular value spectra (WeightWatcher)')

7. Practical: Randomised vs Exact SVD Comparison

We benchmark our randomised SVD implementation against exact SVD across different matrix sizes and ranks, showing the computational advantage.

Code cell 31

# === 7.3 Randomised SVD: Accuracy vs Rank ===

import time

np.random.seed(77)
m_bench, n_bench = 500, 400

# Matrix with exponentially decaying singular values
U_b, _ = la.qr(np.random.randn(m_bench, m_bench))
V_b, _ = la.qr(np.random.randn(n_bench, n_bench))
n_sv = min(m_bench, n_bench)
true_s_bench = np.exp(-np.arange(n_sv) / 20.0)  # slow decay
S_bench = np.zeros((m_bench, n_bench))
for i in range(n_sv):
    S_bench[i, i] = true_s_bench[i]
A_bench = U_b @ S_bench @ V_b.T

# Exact SVD timing
t0 = time.time()
U_ex_b, s_ex_b, Vt_ex_b = la.svd(A_bench, full_matrices=False)
t_exact = time.time() - t0

print(f'Matrix: {m_bench}x{n_bench}')
print(f'Exact SVD time: {t_exact*1000:.1f} ms')
print(f'\nRandomised SVD accuracy vs k (n_iter=2):')
print(f'{"k":>6} {"Time (ms)":>12} {"Rel Frob error":>16} {"Speedup":>10}')

for k in [5, 10, 20, 50, 100]:
    t0 = time.time()
    U_r, s_r, Vt_r = randomized_svd(A_bench, k=k, n_oversampling=5, n_iter=2)
    t_rand = time.time() - t0

    A_r_approx = U_r @ np.diag(s_r) @ Vt_r
    A_k_exact  = U_ex_b[:, :k] @ np.diag(s_ex_b[:k]) @ Vt_ex_b[:k, :]
    err_r = la.norm(A_bench - A_r_approx, 'fro') / la.norm(A_bench, 'fro')
    err_e = la.norm(A_bench - A_k_exact, 'fro') / la.norm(A_bench, 'fro')

    speedup = t_exact / (t_rand + 1e-9)
    print(f'{k:>6} {t_rand*1000:>12.1f} {err_r:>16.4f} {speedup:>10.1f}x  (exact_k error={err_e:.4f})')

print(f'\nKey takeaway: randomised SVD achieves near-exact error in much less time')

Summary

| Section | Key result |
| --- | --- |
| 1. Geometry | Unit sphere → ellipsoid; semi-axes = singular values |
| 2. Definitions | $A = U\Sigma V^\top$; $\sigma_i = \sqrt{\lambda_i(A^\top A)}$; four subspaces |
| 3. Computing | Never form $A^\top A$; Golub-Reinsch; randomised SVD $O(mnk)$ |
| 4. Properties | Norms = functions of $\sigma_i$; Eckart-Young; pseudoinverse; ridge shrinkage |
| 5. Outer products | $A = \sum_i \sigma_i \mathbf{u}_i \mathbf{v}_i^\top$; image compression |
| 6. AI | LoRA = low-rank SVD of $\Delta W$; WeightWatcher = power-law spectrum |
| 7. Practice | Randomised SVD: 10–50× speedup with <1% extra error |

Next: exercises.ipynb — 8 graded problems.

Then: 03-03 Principal Component Analysis — PCA is the SVD of the centred data matrix.