Theory Notebook
Converted from theory.ipynb for web reading.
Orthogonality and Orthonormality
"Pythagoras could have predicted deep learning: orthogonality is not just a geometric nicety — it is the reason gradient-based optimization works."
Interactive theory notebook for §05 Orthogonality and Orthonormality. Covers inner products, Gram-Schmidt, QR decomposition, orthogonal projections, the spectral theorem, and their applications in modern machine learning.
Companion notes: notes.md | Exercises: exercises.ipynb
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
import seaborn as sns
sns.set_theme(style="whitegrid", palette="colorblind")
HAS_SNS = True
except ImportError:
plt.style.use("seaborn-v0_8-whitegrid")
HAS_SNS = False
mpl.rcParams.update({
"figure.figsize": (10, 6),
"figure.dpi": 120,
"font.size": 13,
"axes.titlesize": 15,
"axes.labelsize": 13,
"xtick.labelsize": 11,
"ytick.labelsize": 11,
"legend.fontsize": 11,
"legend.framealpha": 0.85,
"lines.linewidth": 2.0,
"axes.spines.top": False,
"axes.spines.right": False,
"savefig.bbox": "tight",
"savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import numpy as np
import numpy.linalg as la
from scipy.linalg import qr, polar, solve_triangular
try:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = [10, 5]
plt.rcParams['font.size'] = 12
HAS_MPL = True
except ImportError:
HAS_MPL = False
try:
import seaborn as sns
HAS_SNS = True
except ImportError:
HAS_SNS = False
np.set_printoptions(precision=6, suppress=True)
np.random.seed(42)
def check(name, got=None, expected=None, cond=None, tol=1e-8):
if cond is not None:
ok = cond
else:
ok = np.allclose(got, expected, atol=tol, rtol=tol)
print(f"{'PASS' if ok else 'FAIL'} -- {name}")
return ok
print('Setup complete. NumPy', np.__version__)
1. Intuition: Why Orthogonality Matters
Orthogonality is the single most powerful structure in linear algebra. When vectors are orthogonal, computations that would otherwise require matrix inversions collapse to simple inner products. This section demonstrates the key computational miracles that orthogonality enables.
Code cell 5
# === 1.1 Orthogonality: Geometric Demonstration ===
# In a non-orthogonal basis, expressing a vector requires solving a system
v = np.array([3.0, 2.0])
b1 = np.array([1.0, 0.5]) # non-orthogonal basis
b2 = np.array([0.5, 1.0])
B = np.column_stack([b1, b2])
coords_nonorth = la.solve(B, v) # requires matrix inverse
print(f'Non-orthogonal: solve Bc = v requires O(n^3) work')
print(f' Coordinates: {coords_nonorth}')
# In an orthonormal basis, it's just inner products (O(n) work)
q1 = np.array([1.0, 0.0])
q2 = np.array([0.0, 1.0])
c1 = v @ q1 # just an inner product
c2 = v @ q2
print(f'\nOrthonormal: inner products only')
print(f' Coordinates: ({c1}, {c2})')
print(f' Reconstruction: {c1*q1 + c2*q2}')
# Verify both give same reconstruction
check('both bases reconstruct v', B @ coords_nonorth, v)
check('ONB reconstructs v', c1*q1 + c2*q2, v)
print('\nKey insight: ONB makes coordinates = inner products (no matrix inversion needed)')
Code cell 6
# === 1.2 Cauchy-Schwarz Inequality Visualization ===
# Demonstrate |<u,v>| <= ||u|| ||v|| for many random pairs
n_tests = 1000
dim = 10
ratios = []
for _ in range(n_tests):
u = np.random.randn(dim)
v = np.random.randn(dim)
lhs = abs(u @ v)
rhs = la.norm(u) * la.norm(v)
ratios.append(lhs / rhs) # should always be <= 1
ratios = np.array(ratios)
print(f'Cauchy-Schwarz test over {n_tests} random pairs:')
print(f' Max ratio |<u,v>| / (||u|| ||v||) = {ratios.max():.8f} (should be ≤ 1)')
print(f' Mean ratio = {ratios.mean():.4f}')
check('Cauchy-Schwarz: all ratios <= 1', cond=bool(np.all(ratios <= 1 + 1e-10)))
# When does equality hold? When u and v are parallel
u_eq = np.array([1.0, 2.0, 3.0])
v_eq = 2.5 * u_eq # parallel to u
ratio_eq = abs(u_eq @ v_eq) / (la.norm(u_eq) * la.norm(v_eq))
print(f'\nEquality case (parallel vectors): ratio = {ratio_eq:.10f} (should be 1.0)')
Code cell 7
# === 1.3 Inner Product Encodes Angle ===
# cos(theta) = <u,v> / (||u|| ||v||)
test_cases = [
(np.array([1,0,0]), np.array([0,1,0]), 'e1, e2 (perpendicular)'),
(np.array([1,0,0]), np.array([1,1,0])/np.sqrt(2), 'e1, 45deg from e1'),
(np.array([1,0,0]), np.array([-1,0,0]), 'e1, -e1 (antiparallel)'),
(np.array([1,1,0])/np.sqrt(2), np.array([1,1,1])/np.sqrt(3), 'diag2, diag3'),
]
print(f'{"Vector pair":<40} {"cos(θ)":>7} {"angle":>7}')
print('-' * 60)
for u, v, name in test_cases:
cos_t = u @ v / (la.norm(u) * la.norm(v))
angle_deg = np.degrees(np.arccos(np.clip(cos_t, -1, 1)))
print(f'{name:<40} {cos_t:7.4f} {angle_deg:6.1f}°')
2. Formal Definitions
2.1–2.5: Cauchy-Schwarz, Orthonormality, Parseval, Complements
We verify the key identities numerically.
Code cell 9
# === 2.1 Cauchy-Schwarz and Norm Identities ===
# Pythagorean theorem
u = np.array([3.0, 0.0])
v = np.array([0.0, 4.0])
print(f'Pythagorean theorem: u⊥v -> ||u+v||² = ||u||² + ||v||²')
print(f' ||u+v||² = {la.norm(u+v)**2:.2f}')
print(f' ||u||² + ||v||² = {la.norm(u)**2 + la.norm(v)**2:.2f}')
check('Pythagorean theorem', la.norm(u+v)**2, la.norm(u)**2 + la.norm(v)**2)
# Parseval's identity with standard basis (trivial ONB)
v_test = np.array([3.0, 1.0, 4.0, 1.0, 5.0])
Q_std = np.eye(5) # standard ONB
coeffs = Q_std.T @ v_test
parseval_sum = np.sum(coeffs**2)
print(f'\nParseval identity: ||v||² = Σᵢ <v,qᵢ>²')
print(f' ||v||² = {la.norm(v_test)**2:.2f}')
print(f' Σᵢ |<v,qᵢ>|² = {parseval_sum:.2f}')
check('Parseval identity (standard basis)', parseval_sum, la.norm(v_test)**2)
# Parseval with a random ONB
M = np.random.randn(5, 5)
Q_rand, _ = la.qr(M) # random ONB
coeffs_rand = Q_rand.T @ v_test
check('Parseval identity (random ONB)', np.sum(coeffs_rand**2), la.norm(v_test)**2)
Code cell 10
# === 2.5 Orthogonal Decomposition ===
# Decompose v into S component and S-perp component
# S = span{q1, q2} in R^4
q1 = np.array([1., 1., 0., 0.]) / np.sqrt(2)
q2 = np.array([0., 0., 1., 1.]) / np.sqrt(2)
Q_S = np.column_stack([q1, q2]) # 4x2 matrix, orthonormal columns
v = np.array([1., 2., 3., 4.])
# Projection onto S
v_S = Q_S @ (Q_S.T @ v) # = sum_i <v,qi> qi
# Component in S-perp
v_Sperp = v - v_S
print('Orthogonal Decomposition: v = v_S + v_S⊥')
print(f' v = {v}')
print(f' v_S = {v_S}')
print(f' v_S⊥ = {v_Sperp}')
print(f' Sum = {v_S + v_Sperp}')
check('Reconstruction: v_S + v_Sperp = v', v_S + v_Sperp, v)
check('v_Sperp ⊥ q1', cond=abs(v_Sperp @ q1) < 1e-12)
check('v_Sperp ⊥ q2', cond=abs(v_Sperp @ q2) < 1e-12)
# dim check
print(f'\ndim(S) = 2, dim(S⊥) = 2, dim(R^4) = 4')
print(f'dim(S) + dim(S⊥) = {2 + 2} = dim(R^4) ✓')
# Pythagorean theorem for the decomposition
check('||v||² = ||v_S||² + ||v_S⊥||²',
la.norm(v)**2, la.norm(v_S)**2 + la.norm(v_Sperp)**2)
3. Orthogonal Matrices
3.1–3.5: Definition, O(n), Condition Number, Eigenvalues, Cayley Transform
Code cell 12
# === 3.1 Orthogonal Matrix Properties ===
# Rotation matrix in 2D
theta = np.pi / 3 # 60 degrees
Q_rot = np.array([[np.cos(theta), -np.sin(theta)],
[np.sin(theta), np.cos(theta)]])
print('2D Rotation matrix (theta = pi/3):')
print(Q_rot)
print(f'\nVerifying orthogonality:')
check('Q^T Q = I', Q_rot.T @ Q_rot, np.eye(2))
check('Q Q^T = I', Q_rot @ Q_rot.T, np.eye(2))
check('det(Q) = 1', cond=abs(la.det(Q_rot) - 1) < 1e-12)
# Isometry: preserves norms
x = np.array([3.0, 4.0])
Qx = Q_rot @ x
print(f'\nIsometry test:')
print(f' ||x|| = {la.norm(x):.6f}')
print(f' ||Qx|| = {la.norm(Qx):.6f}')
check('||Qx|| = ||x||', la.norm(Qx), la.norm(x))
# Preserves inner products
y = np.array([1.0, 2.0])
Qy = Q_rot @ y
check('<Qx, Qy> = <x, y>', Qx @ Qy, x @ y)
# Condition number
kappa_Q = la.cond(Q_rot)
print(f'\nCondition number of Q_rot: {kappa_Q:.6f} (should be 1.0)')
Code cell 13
# === 3.2 Orthogonal Group O(n) ===
# Generate random orthogonal matrices via QR
def random_orthogonal(n, seed=None):
rng = np.random.default_rng(seed)
M = rng.standard_normal((n, n))
Q, R = la.qr(M)
    Q *= np.sign(np.diag(R))  # fix column signs so Q is Haar-uniform on O(n)
return Q
Q1 = random_orthogonal(4, seed=1)
Q2 = random_orthogonal(4, seed=2)
print('Closure under multiplication:')
Q3 = Q1 @ Q2
check('Q1 @ Q2 is orthogonal', Q3.T @ Q3, np.eye(4))
print('\nInverse = transpose:')
check('Q^{-1} = Q^T', la.inv(Q1), Q1.T)
print('\nDeterminant is ±1:')
for i, Q in enumerate([Q1, Q2, Q3]):
d = la.det(Q)
print(f' det(Q{i+1}) = {d:.6f}')
# Reflection (det = -1)
Q_refl = np.diag([-1., 1., 1., 1.]) # flip first coordinate
check('Reflection is orthogonal', Q_refl.T @ Q_refl, np.eye(4))
print(f' det(Q_refl) = {la.det(Q_refl):.1f} (reflection: det = -1)')
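The subsection list above also names eigenvalues and the Cayley transform, which the cells in this section do not exercise. A minimal sketch of both (not in the original notebook): every eigenvalue of an orthogonal matrix has modulus 1, and the Cayley transform turns any skew-symmetric S into a rotation Q = (I - S)(I + S)^{-1}.
# === 3.4–3.5 Eigenvalues and the Cayley Transform (sketch) ===
# Eigenvalues of an orthogonal matrix lie on the unit circle
Q_eig = random_orthogonal(5, seed=3)
lams = la.eigvals(Q_eig)
check('all |lambda(Q)| = 1', cond=bool(np.allclose(np.abs(lams), 1.0)))
# Cayley transform: skew-symmetric S -> orthogonal Q = (I - S)(I + S)^{-1}
# I + S is always invertible: eigenvalues of a real skew matrix are purely imaginary
M = np.random.randn(4, 4)
S = (M - M.T) / 2  # skew-symmetric: S^T = -S
I4 = np.eye(4)
Q_cay = (I4 - S) @ la.inv(I4 + S)
check('Cayley Q is orthogonal', Q_cay.T @ Q_cay, I4)
check('det(Q_cay) = 1 (rotation)', cond=abs(la.det(Q_cay) - 1) < 1e-10)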
4. Orthogonal Projections
Orthogonal projection is the fundamental operation connecting orthogonality to computation. Every least-squares problem, every PCA step, and every attention head involves a projection.
Code cell 15
# === 4.1 Orthogonal Projection Formula ===
# Projection onto a 1D subspace (unit vector)
u_hat = np.array([1., 1., 0.]) / np.sqrt(2) # unit vector
v = np.array([2., 1., 3.])
proj_v = (v @ u_hat) * u_hat
print(f'Projection of v = {v}')
print(f'onto u_hat = {u_hat}')
print(f'proj = {proj_v}')
print(f'coefficient <v, u_hat> = {v @ u_hat:.4f}')
# Verify orthogonality of residual
residual = v - proj_v
print(f'\nResidual v - proj = {residual}')
print(f'<residual, u_hat> = {residual @ u_hat:.2e} (should be ~0)')
check('Residual ⊥ u_hat', cond=abs(residual @ u_hat) < 1e-12)
# Projection matrix P = u_hat u_hat^T
P1D = np.outer(u_hat, u_hat)
check('P idempotent: P^2 = P', P1D @ P1D, P1D)
check('P symmetric: P^T = P', P1D.T, P1D)
check('P v = proj_v', P1D @ v, proj_v)
Code cell 16
# === 4.2 Projection onto Higher-Dimensional Subspace ===
# Build an ONB for a 2D subspace of R^4
a1 = np.array([1., 1., 0., 0.])
a2 = np.array([0., 1., 1., 0.])
# Gram-Schmidt to get ONB
q1 = a1 / la.norm(a1)
u2 = a2 - (a2 @ q1) * q1
q2 = u2 / la.norm(u2)
Q_sub = np.column_stack([q1, q2]) # 4x2 with Q^T Q = I
check('Q^T Q = I2', Q_sub.T @ Q_sub, np.eye(2))
# Project v onto subspace
v = np.array([1., 2., 3., 4.])
P_sub = Q_sub @ Q_sub.T # 4x4 projection matrix
v_proj = P_sub @ v
v_perp = v - v_proj
print(f'Projection onto S = span(q1, q2):')
print(f' v_S = {v_proj}')
print(f' v_S⊥ = {v_perp}')
check('P idempotent', P_sub @ P_sub, P_sub)
check('P symmetric', P_sub.T, P_sub)
check('v_perp ⊥ q1', cond=abs(v_perp @ q1) < 1e-12)
check('v_perp ⊥ q2', cond=abs(v_perp @ q2) < 1e-12)
check('||v||² = ||v_S||² + ||v_perp||²',
la.norm(v)**2, la.norm(v_proj)**2 + la.norm(v_perp)**2)
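The section intro ties projections to least squares; when the spanning set is not orthonormal, the projector takes the general form P = A (A^T A)^{-1} A^T, which collapses to Q Q^T once the basis is orthonormalized. A quick sketch (not an original cell), reusing a1, a2 from above:
# === 4.2b General projector P = A (A^T A)^{-1} A^T (sketch) ===
A_span = np.column_stack([a1, a2])  # same subspace, non-orthonormal basis
P_gen = A_span @ la.inv(A_span.T @ A_span) @ A_span.T
check('general projector equals Q Q^T', P_gen, P_sub)
check('P_gen v = v_S', P_gen @ v, v_proj)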
Code cell 17
# === 4.3 Best Approximation: Projection is the Nearest Point ===
# Visualize projection in R^2 and verify best-approximation property
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# Left: 2D projection visualization
ax = axes[0]
origin = np.array([0, 0])
u_hat2 = np.array([1., 1.]) / np.sqrt(2)
v2 = np.array([2., 0.5])
proj2 = (v2 @ u_hat2) * u_hat2
# Draw the subspace line
t = np.linspace(-0.5, 3, 100)
ax.plot(t * u_hat2[0], t * u_hat2[1], 'b-', linewidth=2, label='Subspace S', alpha=0.6)
# Draw v and projection
ax.annotate('', xy=v2, xytext=origin,
arrowprops=dict(arrowstyle='->', color='black', lw=2))
ax.annotate('', xy=proj2, xytext=origin,
arrowprops=dict(arrowstyle='->', color='blue', lw=2.5))
ax.plot([v2[0], proj2[0]], [v2[1], proj2[1]], 'r--', linewidth=1.5, label='Residual (⊥ S)')
# Mark the right angle
ax.text(v2[0] + 0.05, v2[1] + 0.05, 'v', fontsize=14)
ax.text(proj2[0] + 0.05, proj2[1] - 0.15, r'$v_S$ (projection)', fontsize=12, color='blue')
ax.set_xlim(-0.3, 2.5)
ax.set_ylim(-0.5, 1.5)
ax.set_aspect('equal')
ax.legend()
    ax.set_title(r'Orthogonal Projection onto S = span{$\hat{u}$}')
ax.set_xlabel('x')
ax.set_ylabel('y')
# Right: distances to subspace
ax2 = axes[1]
n_pts = 30
t_vals = np.linspace(-1, 4, n_pts)
distances = [la.norm(v2 - t * u_hat2) for t in t_vals]
t_opt = v2 @ u_hat2 # optimal t (= the projection coefficient)
ax2.plot(t_vals, distances, 'b-', linewidth=2)
ax2.axvline(t_opt, color='red', linestyle='--', label=f'Minimum at t={t_opt:.2f}')
ax2.scatter([t_opt], [la.norm(v2 - t_opt * u_hat2)], color='red', s=100, zorder=5)
ax2.set_xlabel('t (scalar multiple of û)')
ax2.set_ylabel('||v - t·û||')
ax2.set_title('Distance from v to t·û\n(minimum at projection coefficient)')
ax2.legend()
plt.tight_layout()
plt.savefig('/tmp/orth_projection.png', dpi=100, bbox_inches='tight')
plt.show()
print('Figure saved.')
else:
# Numerical demonstration
u_hat2 = np.array([1., 1.]) / np.sqrt(2)
v2 = np.array([2., 0.5])
proj2 = (v2 @ u_hat2) * u_hat2
t_opt = v2 @ u_hat2
t_vals = np.linspace(-1, 4, 20)
distances = [la.norm(v2 - t * u_hat2) for t in t_vals]
print(f'Projection coefficient t_opt = {t_opt:.4f}')
print(f'Minimum distance = {min(distances):.4f}')
print(f'Distance at t_opt = {la.norm(v2 - t_opt * u_hat2):.4f}')
check('t_opt minimizes distance', min(distances), la.norm(v2 - t_opt * u_hat2), tol=1e-4)
5. Gram-Schmidt Orthogonalization
Gram-Schmidt is the algorithmic heart of orthogonality: given any basis, produce an orthonormal one. We implement both Classical (CGS) and Modified (MGS) versions and study their numerical stability.
Code cell 19
# === 5.1–5.2 Classical vs Modified Gram-Schmidt ===
def cgs(A):
"""Classical Gram-Schmidt."""
m, n = A.shape
Q = np.zeros_like(A, dtype=float)
R = np.zeros((n, n))
for j in range(n):
v = A[:, j].copy().astype(float)
for i in range(j):
R[i, j] = Q[:, i] @ A[:, j]
v -= R[i, j] * Q[:, i]
R[j, j] = la.norm(v)
Q[:, j] = v / R[j, j]
return Q, R
def mgs(A):
"""Modified Gram-Schmidt."""
m, n = A.shape
Q = A.astype(float).copy()
R = np.zeros((n, n))
for j in range(n):
R[j, j] = la.norm(Q[:, j])
Q[:, j] /= R[j, j]
for k in range(j+1, n):
R[j, k] = Q[:, j] @ Q[:, k]
Q[:, k] -= R[j, k] * Q[:, j]
return Q, R
# Test on well-conditioned matrix
A = np.array([[1., 1., 0.],
[1., 0., 1.],
[0., 1., 1.]])
Q_cgs, R_cgs = cgs(A)
Q_mgs, R_mgs = mgs(A)
print('Testing Gram-Schmidt on well-conditioned matrix:')
check('CGS: Q^T Q = I', Q_cgs.T @ Q_cgs, np.eye(3))
check('CGS: Q R = A', Q_cgs @ R_cgs, A)
check('MGS: Q^T Q = I', Q_mgs.T @ Q_mgs, np.eye(3))
check('MGS: Q R = A', Q_mgs @ R_mgs, A)
print(f'\nQ from CGS:\n{Q_cgs}')
print(f'R from CGS:\n{R_cgs}')
Code cell 20
# === 5.2 Numerical Stability: CGS vs MGS on Ill-Conditioned Matrix ===
def orthogonality_error(Q):
"""Frobenius norm of Q^T Q - I."""
n = Q.shape[1]
return la.norm(Q.T @ Q - np.eye(n), 'fro')
# Hilbert matrix: famously ill-conditioned
ns = range(4, 14)
errs_cgs, errs_mgs, errs_house = [], [], []
for n in ns:
H = 1.0 / (np.arange(1, n+1)[:, None] + np.arange(1, n+1)[None, :] - 1)
Q_c, _ = cgs(H)
Q_m, _ = mgs(H)
Q_h, _ = la.qr(H) # Householder (numpy's implementation)
errs_cgs.append(orthogonality_error(Q_c))
errs_mgs.append(orthogonality_error(Q_m))
errs_house.append(orthogonality_error(Q_h))
ns_list = list(ns)
print('Orthogonality error ||Q^T Q - I||_F for Hilbert matrix:')
print(f'{"n":>4} {"CGS":>14} {"MGS":>14} {"Householder":>14}')
for i, n in enumerate(ns_list):
print(f'{n:>4} {errs_cgs[i]:14.2e} {errs_mgs[i]:14.2e} {errs_house[i]:14.2e}')
print('\nConclusion: Householder > MGS >> CGS for ill-conditioned matrices')
if HAS_MPL:
plt.figure(figsize=(9, 5))
plt.semilogy(ns_list, errs_cgs, 'r-o', label='Classical GS', linewidth=2)
plt.semilogy(ns_list, errs_mgs, 'b-s', label='Modified GS', linewidth=2)
plt.semilogy(ns_list, errs_house, 'g-^', label='Householder QR', linewidth=2)
plt.axhline(np.finfo(float).eps, color='gray', linestyle=':', label='Machine epsilon')
plt.xlabel('Matrix size n')
plt.ylabel('||Q^T Q - I||_F')
plt.title('Orthogonality Loss: CGS vs MGS vs Householder\n(Hilbert matrix, highly ill-conditioned)')
plt.legend()
plt.tight_layout()
plt.savefig('/tmp/orth_stability.png', dpi=100, bbox_inches='tight')
plt.show()
6. Householder Reflectors
Householder reflectors are the numerically stable way to perform orthogonal reductions. Each reflector zeros out a column below the diagonal with a single orthogonal transformation.
Code cell 22
# === 6.1 Householder Reflector Construction ===
def householder_reflector(a):
"""Build H = I - 2nn^T such that Ha = -||a||e1.
Sign chosen to avoid cancellation."""
n = len(a)
v = a.copy().astype(float)
# Add sign(a[0])*||a|| to first component
sign = 1.0 if a[0] >= 0 else -1.0
v[0] += sign * la.norm(a)
    v /= la.norm(v)  # normalize to get the unit normal v
H = np.eye(n) - 2 * np.outer(v, v)
return H
# Test: zero out components of a below first entry
a = np.array([4., 3., 0.])
H = householder_reflector(a)
print(f'a = {a}, ||a|| = {la.norm(a):.4f}')
print(f'H a = {H @ a}')
print(f'Expected: [{-la.norm(a):.4f}, 0., 0.] or [{la.norm(a):.4f}, 0., 0.]')
check('H symmetric: H^T = H', H.T, H)
check('H orthogonal: H^T H = I', H.T @ H, np.eye(3))
check('H involutory: H^2 = I', H @ H, np.eye(3))
check('|det(H)| = 1', cond=abs(abs(la.det(H)) - 1) < 1e-12)
print(f'det(H) = {la.det(H):.4f} (reflection: det = -1)')
# Geometric verification: Ha zeros out lower components
Ha = H @ a
print(f'\nGeometric result: Ha = {Ha}')
print(f'Components below first: {Ha[1:]}')
check('Lower components zeroed', Ha[1:], np.zeros(2))
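A single reflector is the building block; a full QR factorization chains one reflector per column, each zeroing the subdiagonal entries of the column it acts on. A minimal, unoptimized sketch built on householder_reflector above (production code applies reflectors implicitly instead of forming dense H matrices):
# === 6.2 Householder QR: one reflector per column (sketch) ===
def householder_qr(A):
    """QR via an explicit product of Householder reflectors (dense, didactic)."""
    m, n = A.shape
    R = A.astype(float).copy()
    Q = np.eye(m)
    for k in range(n):
        Hk = np.eye(m)
        Hk[k:, k:] = householder_reflector(R[k:, k])  # zeros R[k+1:, k]
        R = Hk @ R
        Q = Q @ Hk  # each Hk is symmetric, so A = (H0 H1 ...) R = Q R
    return Q, R
A_test = np.random.randn(5, 3)
Qh, Rh = householder_qr(A_test)
check('Householder QR: Q^T Q = I', Qh.T @ Qh, np.eye(5))
check('Householder QR: Q R = A', Qh @ Rh, A_test)
check('R upper triangular', cond=bool(np.allclose(np.tril(Rh, -1), 0)))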
7. QR Decomposition: Theory and Applications
QR decomposition is the workhorse of numerical linear algebra. We demonstrate existence, uniqueness, and the key application: stable least squares.
Code cell 24
# === 7.2 Least Squares via QR vs Normal Equations ===
# Polynomial fitting: design matrix (Vandermonde)
m, d = 20, 8 # m data points, degree-d polynomial
np.random.seed(0)
x = np.linspace(0, 1, m)
y_true = np.sin(2 * np.pi * x)
y = y_true + 0.1 * np.random.randn(m)
# Design matrix: A[i,j] = x[i]^j
A = np.vander(x, d+1, increasing=True) # m x (d+1)
# Method 1: Normal equations (squares condition number)
ATA = A.T @ A
ATb = A.T @ y
c_normal = la.solve(ATA, ATb)
# Method 2: QR decomposition (stable)
Q_qr, R_qr = la.qr(A)  # reduced (thin) QR: Q is m x (d+1), R is square
c_qr = solve_triangular(R_qr, Q_qr.T @ y)
# Condition numbers
kappa_A = la.cond(A)
kappa_ATA = la.cond(ATA)
print(f'Condition numbers:')
print(f' kappa(A) = {kappa_A:.2e}')
print(f' kappa(A^T A) = {kappa_ATA:.2e} (approximately kappa(A)^2)')
print(f' Ratio: {kappa_ATA / kappa_A**2:.3f} (should be ~1)')
# Compare residuals
res_normal = la.norm(A @ c_normal - y)
res_qr = la.norm(A @ c_qr - y)
print(f'\nResiduals:')
print(f' Normal equations: ||Ac - y|| = {res_normal:.8f}')
print(f' QR factorization: ||Ac - y|| = {res_qr:.8f}')
# Compare coefficients
coeff_diff = la.norm(c_normal - c_qr)
print(f' ||c_normal - c_qr|| = {coeff_diff:.2e}')
print(f'\nBoth methods agree on residual (same LS solution), but')
print(f'QR is more stable when kappa is large.')
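The cell above demonstrates the least-squares application; the uniqueness claim from the intro can be checked directly as well. For full-column-rank A, the thin QR factorization with positive diagonal in R is unique, so two different algorithms must agree once column signs are normalized. A quick sketch (not an original cell) reusing mgs from §5:
# === 7.3 Uniqueness of the thin QR factorization (sketch) ===
def fix_signs(Q, R):
    """Flip column/row signs so that diag(R) > 0 (the canonical representative)."""
    s = np.sign(np.diag(R))
    return Q * s, s[:, None] * R
A_u = np.random.randn(6, 4)  # full column rank (almost surely)
Q_a, R_a = fix_signs(*mgs(A_u))
Q_b, R_b = fix_signs(*la.qr(A_u))
check('uniqueness: Q agrees across algorithms', Q_a, Q_b)
check('uniqueness: R agrees across algorithms', R_a, R_b)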
8. The Spectral Theorem for Symmetric Matrices
Every symmetric matrix can be diagonalized by an orthogonal matrix. This underlies PCA, Hessian analysis, and the geometry of neural network loss surfaces.
Code cell 26
# === 8.1 Spectral Decomposition ===
A = np.array([[4., 2., 0.],
[2., 3., 1.],
[0., 1., 2.]])
eigvals, Q = np.linalg.eigh(A)
print(f'Eigenvalues: {eigvals} (all real — spectral theorem)')
check('Q is orthogonal: Q^T Q = I', Q.T @ Q, np.eye(3))
Lambda = np.diag(eigvals)
check('Spectral decomp: Q Lambda Q^T = A', Q @ Lambda @ Q.T, A)
# Each eigenpair A q_i = lambda_i q_i
for i in range(3):
check(f'A q{i+1} = lambda{i+1} q{i+1}', A @ Q[:,i], eigvals[i]*Q[:,i])
Code cell 27
# === 8.2 Rayleigh Quotient ===
A2 = np.array([[3., 1.], [1., 3.]])
lam_min, lam_max = np.linalg.eigvalsh(A2)
print(f'Eigenvalues: lambda_min={lam_min}, lambda_max={lam_max}')
# Rayleigh quotient over all unit-vector directions
thetas = np.linspace(0, 2*np.pi, 1000)
rq = np.array([(np.array([np.cos(t),np.sin(t)]) @ A2 @ np.array([np.cos(t),np.sin(t)]))
for t in thetas])
print(f'Rayleigh quotient range: [{rq.min():.6f}, {rq.max():.6f}]')
print(f'Eigenvalue range: [{lam_min:.6f}, {lam_max:.6f}]')
check('Min Rayleigh = lambda_min', rq.min(), lam_min, tol=1e-3)
check('Max Rayleigh = lambda_max', rq.max(), lam_max, tol=1e-3)
if HAS_MPL:
plt.figure(figsize=(9,4))
plt.plot(np.degrees(thetas), rq, 'b-', lw=2)
plt.axhline(lam_min, color='red', ls='--', label=f'lambda_min={lam_min}')
plt.axhline(lam_max, color='green', ls='--', label=f'lambda_max={lam_max}')
plt.xlabel('Direction (degrees)'); plt.ylabel('Rayleigh quotient')
plt.title('Rayleigh Quotient — bounded by eigenvalues')
plt.legend(); plt.tight_layout(); plt.show()
Code cell 28
# === 8.3 Positive Definiteness via Spectral Theorem ===
matrices = {
'PD [[4,1],[1,3]]': np.array([[4.,1.],[1.,3.]]),
'PSD [[1,1],[1,1]]': np.array([[1.,1.],[1.,1.]]),
'Indef [[2,-3],[-3,1]]': np.array([[2.,-3.],[-3.,1.]]),
'ND [[-4,-1],[-1,-3]]': np.array([[-4.,-1.],[-1.,-3.]]),
}
print(f"{'Matrix':<28} {'Eigenvalues':<20} Definite?")
print('-'*60)
for name, M in matrices.items():
eigs = np.linalg.eigvalsh(M)
if np.all(eigs > 0): defn = 'Positive Definite'
elif np.all(eigs >= 0): defn = 'Positive Semidefinite'
elif np.all(eigs < 0): defn = 'Negative Definite'
else: defn = 'Indefinite'
print(f'{name:<28} {str(eigs):<20} {defn}')
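The section intro claims the spectral theorem underlies PCA; here is a minimal sketch on synthetic 2-D data (not an original cell). The sample covariance matrix is symmetric PSD, so eigh returns an orthonormal basis of principal directions, with eigenvalues equal to the variances along them:
# === 8.4 PCA via the Spectral Theorem (sketch) ===
rng = np.random.default_rng(0)
X = rng.standard_normal((500, 2)) @ np.array([[3., 0.], [1., 1.]])  # correlated cloud
Xc = X - X.mean(axis=0)        # center the data
C = Xc.T @ Xc / (len(Xc) - 1)  # sample covariance (symmetric PSD)
evals, evecs = la.eigh(C)      # ascending eigenvalues, orthonormal columns
check('principal directions are orthonormal', evecs.T @ evecs, np.eye(2))
print(f'Variances along principal axes: {evals[::-1]}')
# Cross-check against the SVD of the centered data
U, s, Vt = la.svd(Xc, full_matrices=False)
check('eigh variances = singular values^2 / (n-1)',
      np.sort(s**2 / (len(Xc) - 1)), evals)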
9. DFT as a Unitary Transform
The Discrete Fourier Transform is orthogonality in disguise: the complex exponentials form an orthonormal basis for ℂⁿ once scaled by 1/√n.
Code cell 30
# === 9.3 DFT Unitarity and Parseval ===
n = 8
omega = np.exp(-2j * np.pi / n)
j = np.arange(n)
F = omega ** np.outer(j, j) # DFT matrix
F_norm = F / np.sqrt(n) # unitary version
err = np.max(np.abs(F_norm @ F_norm.conj().T - np.eye(n)))
print(f'Unitarity: max|F/sqrt(n) (F/sqrt(n))* - I| = {err:.2e}')
check('DFT/sqrt(n) is unitary', cond=err < 1e-12)
# Parseval
x = np.random.randn(n)
x_hat = F @ x
check('Parseval: ||x_hat||^2/n = ||x||^2',
np.real(x_hat @ x_hat.conj()) / n, x @ x)
# DFT columns are orthonormal
for k in range(3):
ip = float(np.real(F_norm[:,0].conj() @ F_norm[:,k+1]))
print(f' <f_0, f_{k+1}> = {ip:.2e} (should be ~0)')
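As an extra cross-check (not an original cell): np.fft.fft uses the same sign convention as the matrix F built above, so the matrix DFT and the FFT should agree to rounding error:
# === 9.4 Cross-check against np.fft (sketch) ===
x_chk = np.random.randn(n)
check('F @ x matches np.fft.fft(x)', F @ x_chk, np.fft.fft(x_chk), tol=1e-9)
# Parseval via the FFT directly: ||fft(x)||^2 / n = ||x||^2
check('Parseval via np.fft', la.norm(np.fft.fft(x_chk))**2 / n, la.norm(x_chk)**2)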
10. Applications in Machine Learning
Code cell 32
# === 10.1 Orthogonal Initialization: Gradient Norm Stability ===
n_dim, L_values = 32, [1, 2, 5, 10, 20]
n_trials = 15
print(f"{'L':>5} {'Gaussian σ_1':>14} {'Orthogonal σ_1':>16}")
for L in L_values:
g_norms, o_norms = [], []
for _ in range(n_trials):
Wg = [np.random.randn(n_dim,n_dim)/np.sqrt(n_dim) for _ in range(L)]
pg = Wg[0].copy()
for W in Wg[1:]: pg = pg @ W
g_norms.append(la.norm(pg, 2))
Wo = []
for _ in range(L):
Q, _ = la.qr(np.random.randn(n_dim,n_dim))
Wo.append(Q)
po = Wo[0].copy()
for W in Wo[1:]: po = po @ W
o_norms.append(la.norm(po, 2))
print(f'{L:>5} {np.mean(g_norms):14.4f} {np.mean(o_norms):16.4f}')
print('Orthogonal init: spectral norm stays ≈ 1.0 for any depth')
Code cell 33
# === 10.2 RoPE: Rotary Position Embeddings ===
def rope_R(pos, dim, base=10000):
d2 = dim // 2
thetas = base ** (-2*np.arange(d2)/dim)
angles = pos * thetas
R = np.zeros((dim, dim))
for i in range(d2):
c, s = np.cos(angles[i]), np.sin(angles[i])
R[2*i:2*i+2, 2*i:2*i+2] = [[c,-s],[s,c]]
return R
dim = 8
print('RoPE property: R_m^T R_n = R_{n-m} (encodes relative position)')
for m, n_pos in [(2,5), (1,4), (0,3)]:
Rm, Rn, Rrel = rope_R(m,dim), rope_R(n_pos,dim), rope_R(n_pos-m,dim)
check(f'R_{m}^T R_{n_pos} = R_{n_pos-m}', Rm.T @ Rn, Rrel)
check('RoPE matrix is orthogonal', rope_R(7,dim).T @ rope_R(7,dim), np.eye(dim))
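Why the relative-position property matters: in attention, the score between a rotated query and a rotated key depends only on the offset n - m, not on the absolute positions. A quick sketch with hypothetical query/key vectors:
# === 10.2b RoPE in attention scores (sketch) ===
q_vec, k_vec = np.random.randn(dim), np.random.randn(dim)
m_pos, n_pos = 3, 9
score = (rope_R(m_pos, dim) @ q_vec) @ (rope_R(n_pos, dim) @ k_vec)
score_rel = q_vec @ (rope_R(n_pos - m_pos, dim) @ k_vec)
check('attention score depends only on n - m', score, score_rel)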
Code cell 34
# === 10.3 Spectral Normalization ===
np.random.seed(42)
W = np.random.randn(64, 64)
sigma_1 = la.svd(W, compute_uv=False)[0]
W_sn = W / sigma_1
print(f'sigma_1(W) = {sigma_1:.4f}')
print(f'sigma_1(W/sigma_1) = {la.svd(W_sn, compute_uv=False)[0]:.6f} (should be 1.0)')
check('Spectrally normalized sigma_1 = 1', la.svd(W_sn,compute_uv=False)[0], 1.0)
# All singular values after normalization
svs = la.svd(W_sn, compute_uv=False)
print(f'Singular values range: [{svs.min():.4f}, {svs.max():.4f}]')
print('Only sigma_1 is clamped to 1; others remain < 1 (not full orthogonal)')
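In practice (e.g., in the original spectral-normalization setup for GANs) sigma_1 is not recomputed with a full SVD at every training step but estimated by power iteration. A minimal sketch, assuming a random start vector and far more iterations than a training loop would use (one step per update suffices there):
# === 10.3b Estimating sigma_1 by power iteration (sketch) ===
u = np.random.randn(64)
for _ in range(200):
    v = W.T @ u; v /= la.norm(v)  # right singular direction estimate
    u = W @ v;  u /= la.norm(u)   # left singular direction estimate
sigma_est = u @ W @ v             # converges to sigma_1(W)
print(f'Power iteration: sigma_1 ≈ {sigma_est:.4f} (SVD: {sigma_1:.4f})')
check('power iteration matches SVD sigma_1', sigma_est, sigma_1, tol=1e-2)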
Code cell 35
# === Polar Decomposition: Nearest Orthogonal Matrix ===
from scipy.linalg import polar
A = np.random.randn(4, 4)
Q_p, S_p = polar(A) # A = Q S
U, sv, Vt = la.svd(A)
Q_svd = U @ Vt # = polar Q factor
check('Q^T Q = I', Q_p.T @ Q_p, np.eye(4))
check('S symmetric', S_p, S_p.T)
check('A = Q S', Q_p @ S_p, A)
check('Polar Q = U V^T', Q_p, Q_svd)
# Muon: Newton-Schulz orthogonalization
G = np.random.randn(32, 32) # gradient matrix
Q_exact, _ = polar(G)
X = G / la.norm(G)
print('Newton-Schulz iterations toward Q = polar(G):')
for step in range(7):
err = la.norm(X - Q_exact, 'fro')
print(f' iter {step}: ||X-Q||_F = {err:.3e}')
X = 1.5*X - 0.5*X @ X.T @ X
Summary
| Section | Key Concept | Key Formula |
|---|---|---|
| §1–2 | Inner product, angle, Parseval | cos θ = <u,v> / (||u|| ||v||); ||v||² = Σᵢ <v,qᵢ>² |
| §3 | Orthogonal matrices, O(n) | QᵀQ = QQᵀ = I; ||Qx|| = ||x|| |
| §4 | Orthogonal projections | P = QQᵀ, P² = P, Pᵀ = P |
| §5 | Gram-Schmidt (CGS vs MGS) | MGS error O(ε·κ(A)) vs CGS O(ε·κ(A)²) |
| §6–7 | Householder QR | H = I - 2vvᵀ; A = QR; solve Rx = Qᵀy |
| §8 | Spectral theorem | A = QΛQᵀ; λ_min ≤ xᵀAx/xᵀx ≤ λ_max |
| §9 | DFT (unitary) | F/√n unitary; Parseval in frequency domain |
| §10 | ML: init, RoPE, SN, polar | RₘᵀRₙ = Rₙ₋ₘ; W/σ₁(W); Newton-Schulz → polar Q |
Exercises: exercises.ipynb — 8 graded problems. Notes: notes.md — full 2000+ line treatment.