Theory Notebook
Converted from theory.ipynb for web reading.
Matrix Norms
"The art of doing mathematics consists in finding that special case which contains all the germs of generality." — David Hilbert
Interactive theory notebook for the Matrix Norms section of Advanced Linear Algebra.
Sections covered:
- Norm Axioms and Intuition
- The Frobenius Norm
- Induced (Operator) Norms — Spectral, 1-norm, ∞-norm
- Schatten Norms and Nuclear Norm
- Unitarily Invariant Norms
- Condition Number
- Perturbation Theory (Weyl's Inequality)
- Applications in ML — Weight Decay, Spectral Norm, LoRA
- Proximal Methods and Singular Value Thresholding
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
import seaborn as sns
sns.set_theme(style="whitegrid", palette="colorblind")
HAS_SNS = True
except ImportError:
plt.style.use("seaborn-v0_8-whitegrid")
HAS_SNS = False
mpl.rcParams.update({
"figure.figsize": (10, 6),
"figure.dpi": 120,
"font.size": 13,
"axes.titlesize": 15,
"axes.labelsize": 13,
"xtick.labelsize": 11,
"ytick.labelsize": 11,
"legend.fontsize": 11,
"legend.framealpha": 0.85,
"lines.linewidth": 2.0,
"axes.spines.top": False,
"axes.spines.right": False,
"savefig.bbox": "tight",
"savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import numpy as np
import numpy.linalg as la
import scipy.linalg as sla
try:
import matplotlib.pyplot as plt
import matplotlib
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = [10, 6]
plt.rcParams['font.size'] = 12
HAS_MPL = True
except ImportError:
HAS_MPL = False
print('matplotlib not available')
try:
import seaborn as sns
HAS_SNS = True
except ImportError:
HAS_SNS = False
np.set_printoptions(precision=6, suppress=True)
np.random.seed(42)
print('Setup complete. NumPy', np.__version__)
1. Norm Axioms and Intuition
A matrix norm on $\mathbb{R}^{m \times n}$ must satisfy:
- Positivity: $\|A\| \ge 0$, with $\|A\| = 0 \iff A = 0$
- Homogeneity: $\|\alpha A\| = |\alpha| \, \|A\|$
- Triangle inequality: $\|A + B\| \le \|A\| + \|B\|$
Different norms measure different aspects of a matrix's 'size'.
Code cell 5
# === 1.1 Verifying Norm Axioms ===
A = np.array([[3., 1., 0.],
[0., 2., -1.],
[1., 0., 4.]])
B = np.random.randn(3, 3)
alpha = 2.5
# Check all four norms satisfy axioms
norms = {
'Frobenius': ('fro', la.norm(A, 'fro')),
'Spectral': (2, la.norm(A, 2)),
'Nuclear': ('nuc', la.norm(A, 'nuc')),
'Matrix-1': (1, la.norm(A, 1)),
'Matrix-inf':(np.inf,la.norm(A, np.inf)),
}
print('Norm values for A:')
for name, (p, val) in norms.items():
tri_ok = la.norm(A+B, p) <= la.norm(A, p) + la.norm(B, p) + 1e-12
hom_ok = abs(la.norm(alpha*A, p) - abs(alpha)*la.norm(A, p)) < 1e-10
print(f' {name:12s}: {val:.4f} | triangle={tri_ok} | homog={hom_ok}')
# Ordering: spectral <= Frobenius <= nuclear
s2 = la.norm(A, 2)
sf = la.norm(A, 'fro')
sn = la.norm(A, 'nuc')
r = np.linalg.matrix_rank(A)
print(f'\nOrdering check: ||A||_2={s2:.4f} <= ||A||_F={sf:.4f} <= ||A||_*={sn:.4f}')
print(f'Rank r={r}: ||A||_F <= sqrt(r)*||A||_2 = {np.sqrt(r)*s2:.4f} OK={sf <= np.sqrt(r)*s2+1e-10}')
1.2 Vector Norm Unit Balls
For vectors in $\mathbb{R}^2$, different $\ell^p$-norms give different unit balls. The Schatten matrix norms, applied to the vector of singular values, exhibit the same geometry.
Code cell 7
# === 1.2 Unit Ball Visualization ===
if HAS_MPL:
theta = np.linspace(0, 2*np.pi, 400)
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
# ell-1 ball (diamond)
t = np.linspace(0, 2*np.pi, 400)
x1 = np.cos(t)
y1 = 1 - np.abs(x1) # |x|+|y|=1 => y=1-|x|
# Use parametric: all (s,t) with |s|+|t|=1
angles = np.array([0, np.pi/2, np.pi, 3*np.pi/2, 2*np.pi])
xs1 = np.array([1, 0, -1, 0, 1])
ys1 = np.array([0, 1, 0, -1, 0])
axes[0].fill(xs1, ys1, alpha=0.3, color='steelblue')
axes[0].plot(xs1, ys1, 'steelblue', lw=2)
axes[0].set_title('$\\ell^1$ ball (nuclear norm on $\\sigma$)')
axes[0].set_aspect('equal'); axes[0].axhline(0,color='k',lw=0.5); axes[0].axvline(0,color='k',lw=0.5)
# ell-2 ball (circle)
xs2 = np.cos(theta); ys2 = np.sin(theta)
axes[1].fill(xs2, ys2, alpha=0.3, color='coral')
axes[1].plot(xs2, ys2, 'coral', lw=2)
axes[1].set_title('$\\ell^2$ ball (Frobenius norm on $\\sigma$)')
axes[1].set_aspect('equal'); axes[1].axhline(0,color='k',lw=0.5); axes[1].axvline(0,color='k',lw=0.5)
# ell-inf ball (square)
xs3 = np.array([-1, 1, 1, -1, -1])
ys3 = np.array([-1, -1, 1, 1, -1])
axes[2].fill(xs3, ys3, alpha=0.3, color='seagreen')
axes[2].plot(xs3, ys3, 'seagreen', lw=2)
axes[2].set_title('$\\ell^\\infty$ ball (spectral norm on $\\sigma$)')
axes[2].set_aspect('equal'); axes[2].axhline(0,color='k',lw=0.5); axes[2].axvline(0,color='k',lw=0.5)
for ax in axes:
ax.set_xlabel('$\\sigma_1$'); ax.set_ylabel('$\\sigma_2$')
ax.set_xlim(-1.3,1.3); ax.set_ylim(-1.3,1.3)
plt.suptitle('Schatten norm unit balls (singular value geometry)', fontsize=13)
plt.tight_layout()
plt.show()
print('Geometry: nuclear=diamond, Frobenius=circle, spectral=square')
else:
print('matplotlib not available — skipping plot')
2. The Frobenius Norm
The Frobenius norm $\|A\|_F = \sqrt{\sum_{i,j} a_{ij}^2} = \sqrt{\operatorname{tr}(A^\top A)} = \|\sigma(A)\|_2$ is the Euclidean norm on the space of matrices. It measures average energy over all directions.
Code cell 9
# === 2.1 Frobenius Norm: Three Equivalent Formulas ===
A = np.array([[3., 1., 0.],
[0., 2., -1.],
[1., 0., 4.]])
# Method 1: elementwise
fro1 = np.sqrt(np.sum(A**2))
# Method 2: trace formula
fro2 = np.sqrt(np.trace(A.T @ A))
# Method 3: SVD formula ||sigma||_2
sigma = la.svdvals(A)
fro3 = la.norm(sigma) # ell-2 norm of singular values
print(f'Method 1 (elementwise): {fro1:.8f}')
print(f'Method 2 (trace): {fro2:.8f}')
print(f'Method 3 (SVD): {fro3:.8f}')
ok = np.allclose([fro1, fro2, fro3], fro1)
print(f'\nPASS - all three formulas agree: {ok}')
print(f'\nSingular values: {sigma}')
print(f'||A||_F = sqrt({" + ".join(f"{s:.4f}^2" for s in sigma)}) = {fro1:.4f}')
Code cell 10
# === 2.2 Frobenius Norm: Unitary Invariance ===
# Unitary matrices preserve Frobenius norm
U, _, Vt = la.svd(np.random.randn(3, 3))
V = Vt.T
fro_A = la.norm(A, 'fro')
fro_UAV = la.norm(U @ A @ V.T, 'fro')
print(f'||A||_F = {fro_A:.8f}')
print(f'||UAV^T||_F = {fro_UAV:.8f}')
print(f'PASS - unitary invariance: {np.isclose(fro_A, fro_UAV)}')
# Submultiplicativity: ||AB||_F <= ||A||_F * ||B||_F
B = np.random.randn(3, 3)
lhs = la.norm(A @ B, 'fro')
rhs = la.norm(A, 'fro') * la.norm(B, 'fro')
print(f'\nSubmultiplicativity: ||AB||_F={lhs:.4f} <= ||A||_F*||B||_F={rhs:.4f}')
print(f'PASS - submultiplicativity: {lhs <= rhs + 1e-10}')
Code cell 11
# === 2.3 Frobenius Norm: Gradient (for ML) ===
# The gradient of ||W||_F^2 is 2W
# This is the weight decay gradient in neural network training
W = np.random.randn(4, 5)
lam = 0.01
# Exact gradient of lambda * ||W||_F^2 = 2*lambda*W
exact_grad = 2 * lam * W
# Finite difference check
eps = 1e-5
fd_grad = np.zeros_like(W)
for i in range(W.shape[0]):
for j in range(W.shape[1]):
W_plus = W.copy(); W_plus[i,j] += eps
W_minus = W.copy(); W_minus[i,j] -= eps
fd_grad[i,j] = (lam*la.norm(W_plus,'fro')**2 - lam*la.norm(W_minus,'fro')**2) / (2*eps)
print(f'Max gradient error (exact vs finite-diff): {np.max(np.abs(exact_grad - fd_grad)):.2e}')
print(f'PASS - gradient of lambda*||W||_F^2 is 2*lambda*W: {np.allclose(exact_grad, fd_grad, atol=1e-6)}')
print(f'\nWeight decay adds gradient 2*lambda*W (shrinks weights toward zero)')
3. Induced (Operator) Norms
The induced $p$-norm is $\|A\|_p = \max_{x \neq 0} \|Ax\|_p / \|x\|_p$. The most important case is $p = 2$ (the spectral norm, $\|A\|_2 = \sigma_1$). Induced norms measure worst-case amplification over all input directions.
Code cell 13
# === 3.1 Spectral Norm = Sigma_1 ===
A = np.random.randn(5, 4)
# Method 1: direct
spec1 = la.norm(A, 2)
# Method 2: largest singular value
sigma = la.svdvals(A)
spec2 = sigma[0]
# Method 3: verify by maximization over random unit vectors
n_trials = 10000
X = np.random.randn(A.shape[1], n_trials)
X /= la.norm(X, axis=0, keepdims=True)
max_empirical = np.max(la.norm(A @ X, axis=0))
print(f'||A||_2 via norm(): {spec1:.6f}')
print(f'||A||_2 via sigma_1: {spec2:.6f}')
print(f'||A||_2 empirical max: {max_empirical:.6f} (approaches true max)')
print(f'\nPASS - spectral norm = sigma_1: {np.isclose(spec1, spec2)}')
# Maximizer is the top right singular vector
U, s, Vt = la.svd(A)
v1 = Vt[0] # top right singular vector
print(f'\nAmplification at v_1: {la.norm(A @ v1):.6f} (should equal sigma_1={s[0]:.6f})')
print(f'PASS - maximizer is v_1: {np.isclose(la.norm(A @ v1), s[0])}')
Code cell 14
# === 3.2 Power Iteration for Spectral Norm ===
def spectral_norm_power_iter(A, n_iter=20, seed=0):
"""Estimate sigma_1(A) via power iteration on A^T A."""
rng = np.random.default_rng(seed)
v = rng.standard_normal(A.shape[1])
v /= la.norm(v)
for _ in range(n_iter):
u = A @ v; u /= la.norm(u)
v = A.T @ u
sigma = la.norm(v)
v /= sigma
return sigma
A = np.random.randn(8, 6)
true_sigma1 = la.svdvals(A)[0]
# Convergence across iterations
estimates = []
v = np.random.randn(A.shape[1]); v /= la.norm(v)
for k in range(25):
u = A @ v; u /= la.norm(u)
v = A.T @ u; sigma = la.norm(v); v /= sigma
estimates.append(sigma)
print(f'True sigma_1: {true_sigma1:.8f}')
print(f'After 5 iterations: {estimates[4]:.8f}')
print(f'After 15 iterations: {estimates[14]:.8f}')
print(f'After 25 iterations: {estimates[24]:.8f}')
sigma2 = la.svdvals(A)[1]
ratio = sigma2/true_sigma1
print(f'\nConvergence rate: (sigma_2/sigma_1)^2 per iteration = {ratio**2:.4f}')
print(f'Expected ~{int(np.log(1e-6)/np.log(ratio**2))} iterations for 1e-6 accuracy')
Code cell 15
# === 3.3 Matrix 1-norm and Infinity-norm ===
A = np.array([[1., 2., -3.],
[-4., 5., 1.],
[2., -1., 6.]])
# Matrix 1-norm: max absolute column sum
col_sums = np.sum(np.abs(A), axis=0)
norm1_manual = np.max(col_sums)
norm1_numpy = la.norm(A, 1)
# Matrix inf-norm: max absolute row sum
row_sums = np.sum(np.abs(A), axis=1)
norminf_manual = np.max(row_sums)
norminf_numpy = la.norm(A, np.inf)
print('Matrix 1-norm (max column sum):')
print(f' Column sums: {col_sums}')
print(f' ||A||_1 = {norm1_manual:.1f} (numpy: {norm1_numpy:.1f})')
print(f' PASS: {np.isclose(norm1_manual, norm1_numpy)}')
print('\nMatrix inf-norm (max row sum):')
print(f' Row sums: {row_sums}')
print(f' ||A||_inf = {norminf_manual:.1f} (numpy: {norminf_numpy:.1f})')
print(f' PASS: {np.isclose(norminf_manual, norminf_numpy)}')
# Duality: ||A^T||_1 = ||A||_inf
print(f'\nDuality: ||A^T||_1={la.norm(A.T,1):.1f} = ||A||_inf={la.norm(A,np.inf):.1f}')
print(f'PASS - duality: {np.isclose(la.norm(A.T, 1), la.norm(A, np.inf))}')
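The induced-norm definition can also be checked directly for these two norms. A minimal sketch (the vectors e_j and s_vec below are constructed purely for illustration): the 1-norm is attained at the standard basis vector selecting the largest-column-sum column, and the infinity-norm at a sign vector matching the largest-row-sum row.
# Maximizers of the induced 1-norm and inf-norm (using A from the cell above)
j_star = np.argmax(np.sum(np.abs(A), axis=0))   # column with largest absolute sum
e_j = np.zeros(A.shape[1]); e_j[j_star] = 1.0   # unit vector in the 1-norm
i_star = np.argmax(np.sum(np.abs(A), axis=1))   # row with largest absolute sum
s_vec = np.sign(A[i_star])                      # unit vector in the inf-norm
print(f'||A e_j||_1  = {la.norm(A @ e_j, 1):.1f}  vs  ||A||_1   = {la.norm(A, 1):.1f}')
print(f'||A s||_inf  = {la.norm(A @ s_vec, np.inf):.1f}  vs  ||A||_inf = {la.norm(A, np.inf):.1f}')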
4. Schatten Norms and the Nuclear Norm
The Schatten $p$-norm applies the vector $\ell^p$-norm to the vector of singular values: $\|A\|_{S_p} = \big(\sum_i \sigma_i(A)^p\big)^{1/p}$.
Special cases: $p = 1$ = nuclear norm, $p = 2$ = Frobenius, $p = \infty$ = spectral.
Code cell 17
# === 4.1 Schatten Norms ===
import numpy as np
import numpy.linalg as la
np.random.seed(42)
A = np.random.randn(5, 4)
sigma = la.svdvals(A)
print(f'Singular values: {sigma}')
for p in [1, 2, 3, 5, np.inf]:
if p == np.inf:
sp = np.max(sigma)
label = 'S_inf (spectral)'
else:
sp = np.sum(sigma**p)**(1/p)
label = f'S_{int(p)}' + (' (nuclear)' if p==1 else ' (Frobenius)' if p==2 else '')
print(f' ||A||_{label}: {sp:.6f}')
# Ordering: S_inf <= S_2 <= S_1
sinf = np.max(sigma)
s2 = la.norm(sigma)
s1 = np.sum(sigma)
print(f'\nOrdering: S_inf={sinf:.4f} <= S_2={s2:.4f} <= S_1={s1:.4f}')
print(f'PASS - ordering holds: {sinf <= s2 <= s1}')
# Verify with numpy
print(f'\nnp.linalg.norm(A, 2) = {la.norm(A, 2):.6f} (spectral)')
print(f'np.linalg.norm(A,"fro") = {la.norm(A,"fro"):.6f} (Frobenius)')
print(f'np.linalg.norm(A,"nuc") = {la.norm(A,"nuc"):.6f} (nuclear)')
Code cell 18
# === 4.2 Nuclear Norm: The Convex Proxy for Rank ===
# Nuclear norm = sum of all singular values
# For rank-k matrix: nuclear = sum of k positive singular values
# For rank-1 matrix: nuclear = spectral = Frobenius
np.random.seed(1)
u = np.random.randn(5)
v = np.random.randn(4)
A_rank1 = np.outer(u, v)
sigma_r1 = la.svdvals(A_rank1)
print('Rank-1 matrix: A = u v^T')
print(f' Singular values: {np.round(sigma_r1, 6)}')
print(f' (only one nonzero: sigma_1 = ||u||*||v|| = {la.norm(u)*la.norm(v):.4f})')
print(f' ||A||_2 = {la.norm(A_rank1, 2):.4f}')
print(f' ||A||_F = {la.norm(A_rank1, "fro"):.4f}')
print(f' ||A||_* = {la.norm(A_rank1, "nuc"):.4f}')
print(f' PASS - all three equal for rank-1: {np.allclose([la.norm(A_rank1, k) for k in [2, "fro", "nuc"]], la.norm(A_rank1, 2))}')
# Nuclear norm as convex relaxation of rank
print('\nNuclear norm vs rank across random matrices:')
for k in [1, 2, 3, 4]:
U = la.qr(np.random.randn(5, k))[0]
V = la.qr(np.random.randn(4, k))[0]
B = U @ np.diag(np.ones(k)) @ V.T
print(f' rank={k}: ||B||_* = {la.norm(B, "nuc"):.4f}, rank(B) = {np.linalg.matrix_rank(B)}')
Code cell 19
# === 4.3 Singular Value Thresholding (Proximal Operator) ===
def svt(A, threshold):
"""Singular Value Thresholding: proximal operator of threshold * ||.||_*."""
U, s, Vt = la.svd(A, full_matrices=False)
s_thresh = np.maximum(s - threshold, 0.0)
return U @ np.diag(s_thresh) @ Vt, s_thresh
A = np.random.randn(5, 4)
print('Original singular values:', np.round(la.svdvals(A), 4))
for tau in [0.5, 1.0, 2.0]:
B, s_new = svt(A, tau)
print(f'\nSVT(A, tau={tau}): sigma={np.round(s_new, 4)}, rank={np.sum(s_new > 0)}')
print(f' ||B||_* = {la.norm(B,"nuc"):.4f}')
# Verify it's the proximal operator: argmin (1/2)||X-A||_F^2 + tau*||X||_*
# By checking stationarity: A - B is in tau * partial ||B||_*
tau = 1.0
B, s_new = svt(A, tau)
residual = A - B
print(f'\nVerification (tau={tau}):')
print(f' ||A - SVT(A,tau)||_2 = {la.norm(residual, 2):.4f} (should = tau={tau} when rank > 0)')
print(f' PASS: {la.norm(residual, 2) <= tau + 1e-8}')
5. Unitarily Invariant Norms
A norm is unitarily invariant if $\|U A V\| = \|A\|$ for all unitary $U, V$. By the Von Neumann characterization, these are exactly the norms that depend only on the singular values of $A$ via a symmetric gauge function.
Code cell 21
# === 5.1 Verifying Unitary Invariance ===
A = np.random.randn(4, 5)
# Generate random unitary matrices
U = la.qr(np.random.randn(4, 4))[0]
V = la.qr(np.random.randn(5, 5))[0]
UAV = U @ A @ V
print('Unitary invariance check: ||UAV|| == ||A||')
for name, p in [('Frobenius','fro'), ('Spectral', 2), ('Nuclear','nuc')]:
norm_A = la.norm(A, p)
norm_UAV = la.norm(UAV, p)
ok = np.isclose(norm_A, norm_UAV)
print(f' {name:10s}: ||A||={norm_A:.6f}, ||UAV||={norm_UAV:.6f}, PASS={ok}')
# Matrix 1-norm is NOT unitarily invariant
norm_A_1 = la.norm(A, 1)
norm_UAV_1 = la.norm(UAV, 1)
print(f' Matrix-1 : ||A||={norm_A_1:.6f}, ||UAV||={norm_UAV_1:.6f}, NOT invariant')
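The Von Neumann characterization mentioned above can be illustrated with a norm built directly from a symmetric gauge function. A minimal sketch using the Ky Fan 2-norm (sum of the two largest singular values) as an example; the helper ky_fan_2 is introduced here purely for illustration.
# Ky Fan 2-norm: g(sigma) = sigma_1 + sigma_2 is a symmetric gauge function,
# so the resulting matrix norm is unitarily invariant
def ky_fan_2(M):
    s = np.sort(la.svdvals(M))[::-1]
    return s[0] + s[1]
kf_A, kf_UAV = ky_fan_2(A), ky_fan_2(UAV)
print(f' Ky Fan 2  : ||A||={kf_A:.6f}, ||UAV||={kf_UAV:.6f}, PASS={np.isclose(kf_A, kf_UAV)}')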
Code cell 22
# === 5.2 Weyl's Inequality for Singular Values ===
# |sigma_i(A+E) - sigma_i(A)| <= sigma_1(E) = ||E||_2
m, n = 6, 5
A = np.random.randn(m, n)
E = np.random.randn(m, n)
sigma_A = la.svdvals(A)
sigma_AE = la.svdvals(A + E)
norm_E = la.norm(E, 2)
diffs = np.abs(sigma_AE - sigma_A)
print(f'||E||_2 = {norm_E:.4f} (Weyl bound)')
print('|sigma_i(A+E) - sigma_i(A)| for each i:')
for i, d in enumerate(diffs):
ok = d <= norm_E + 1e-10
print(f' i={i}: {d:.4f} <= {norm_E:.4f} PASS={ok}')
print(f'\nPASS - Weyl inequality holds for all i: {np.all(diffs <= norm_E + 1e-10)}')
# Mirsky's theorem: ||sigma(A+E) - sigma(A)||_2 <= ||E||_F
diff_vec = la.norm(sigma_AE - sigma_A)
norm_E_F = la.norm(E, 'fro')
print(f'\nMirsky: ||sigma(A+E)-sigma(A)||_2 = {diff_vec:.4f} <= ||E||_F = {norm_E_F:.4f}')
print(f'PASS - Mirsky: {diff_vec <= norm_E_F + 1e-10}')
6. Condition Number
The condition number $\kappa(A) = \|A\|_2 \, \|A^{-1}\|_2 = \sigma_1 / \sigma_n$ measures the sensitivity of linear-system solutions to perturbations. Each power of 10 in $\kappa$ costs roughly one digit of precision in double-precision arithmetic.
Code cell 24
# === 6.1 Condition Number and Numerical Stability ===
np.random.seed(42)
def make_cond(n, kappa):
"""Construct n x n matrix with condition number approximately kappa."""
Q, _ = la.qr(np.random.randn(n, n))
sigmas = np.logspace(0, np.log10(kappa), n) # sigma_1=kappa, sigma_n=1
return Q @ np.diag(sigmas) @ Q.T # symmetric PD
x_true = np.ones(5)
results = []
for kappa_target in [1, 10, 1e3, 1e6, 1e10]:
A = make_cond(5, kappa_target)
b = A @ x_true
kappa_actual = la.cond(A)
# Add tiny noise to b
eps = 1e-10
b_noisy = b + eps * np.random.randn(5)
x_noisy = la.solve(A, b_noisy)
rel_err = la.norm(x_noisy - x_true) / la.norm(x_true)
rel_noise = la.norm(b_noisy - b) / la.norm(b)
amplification = rel_err / rel_noise if rel_noise > 0 else 0
results.append((kappa_actual, rel_err, amplification))
print(f' {"kappa":>12s} {"rel error":>12s} {"amplification":>14s}')
for k, e, a in results:
print(f' {k:12.1e} {e:12.2e} {a:14.2f}')
print('\nObservation: error amplification ~ kappa (as theory predicts)')
Code cell 25
# === 6.2 Tikhonov Regularization ===
np.random.seed(7)
n = 5
# Ill-conditioned A
A = make_cond(n, 1e8)
x_true = np.ones(n)
b = A @ x_true + 0.001 * np.random.randn(n) # b with noise
def tikhonov(A, b, lam):
return la.solve(A.T @ A + lam * np.eye(A.shape[1]), A.T @ b)
print(f'Original condition number: {la.cond(A):.2e}')
print(f'\n lambda kappa(reg.) rel_err comment')
for lam in [0, 1e-6, 1e-4, 1e-2, 1.0]:
reg = A.T @ A + lam * np.eye(n)
kappa_reg = la.cond(reg)
if lam == 0:
x_hat = la.lstsq(A, b, rcond=None)[0]
else:
x_hat = tikhonov(A, b, lam)
err = la.norm(x_hat - x_true) / la.norm(x_true)
note = '(no reg)' if lam==0 else ''
print(f' {lam:.0e} {kappa_reg:.2e} {err:.4f} {note}')
print('\nObservation: regularization trades condition number for solution bias')
7. Perturbation Theory
Weyl's inequality: $|\sigma_i(A+E) - \sigma_i(A)| \le \|E\|_2$ (already shown in §5.2).
Bauer-Fike: for a diagonalizable $A = V \Lambda V^{-1}$, eigenvalues of $A + E$ move by at most $\kappa(V)\,\|E\|_2$; for non-normal matrices $\kappa(V)$ can be enormous, so eigenvalues are far more sensitive than singular values.
Here we compare singular value stability vs. eigenvalue sensitivity.
Code cell 27
# === 7.1 Eigenvalue Sensitivity vs. Singular Value Stability ===
np.random.seed(42)
# Normal matrix: eigenvalue sensitivity ~ ||E||_2 (like Weyl)
A_normal = np.diag([5., 3., 1.]) # symmetric, easy eigenvalues
# Non-normal matrix: Jordan block (extreme sensitivity)
n = 4
A_nonnorm = np.diag(np.zeros(n)) + np.diag(np.ones(n-1), 1) # upper triangular
# All eigenvalues = 0, yet ||A||_2 = 1: the eigenvalues of this Jordan block are extremely perturbation-sensitive
eps = 0.01
n_trials = 200
eig_perturb_normal = []
eig_perturb_nonnorm = []
for _ in range(n_trials):
E = eps * np.random.randn(*A_normal.shape)
eigs_orig = np.sort(np.abs(la.eigvals(A_normal)))
eigs_pert = np.sort(np.abs(la.eigvals(A_normal + E)))
eig_perturb_normal.append(np.max(np.abs(eigs_pert - eigs_orig)))
for _ in range(n_trials):
E = eps * np.random.randn(*A_nonnorm.shape)
eigs_orig = la.eigvals(A_nonnorm)
eigs_pert = la.eigvals(A_nonnorm + E)
# all original eigenvalues are 0, so max |lambda_i(A+E)| measures the displacement
eig_perturb_nonnorm.append(np.max(np.abs(eigs_pert)))
print(f'Normal matrix: mean |Delta lambda| = {np.mean(eig_perturb_normal):.4f} (E has ||E||_2 ~ {eps})')
print(f'Non-normal: max |lambda_i(A+E)| = {np.mean(eig_perturb_nonnorm):.4f}')
print(f'Ratio: {np.mean(eig_perturb_nonnorm)/np.mean(eig_perturb_normal):.1f}x more sensitive for non-normal')
print('\nConclusion: singular values are stable (Weyl); eigenvalues of non-normal matrices are not')
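Bauer-Fike makes the non-normal case quantitative: if $A = V \Lambda V^{-1}$ is diagonalizable, every eigenvalue of $A + E$ lies within $\kappa(V)\,\|E\|_2$ of some eigenvalue of $A$. A minimal sketch on an illustrative diagonalizable (but highly non-normal) $2 \times 2$ matrix:
# Bauer-Fike bound: eigenvalue displacement <= kappa(V) * ||E||_2
A_bf = np.array([[1., 100.],
                 [0., 2.]])              # diagonalizable, eigenvectors nearly parallel
evals, V = la.eig(A_bf)
E = 1e-6 * np.random.randn(2, 2)
evals_pert = la.eigvals(A_bf + E)
disp = max(np.min(np.abs(evals - lp)) for lp in evals_pert)  # distance to nearest original eigenvalue
bound = la.cond(V) * la.norm(E, 2)
print(f'kappa(V) = {la.cond(V):.1f}, ||E||_2 = {la.norm(E, 2):.2e}')
print(f'max eigenvalue displacement {disp:.2e} <= Bauer-Fike bound {bound:.2e}: {disp <= bound}')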
8. Applications in Machine Learning
Matrix norms appear in every major area of ML: regularization (weight decay, nuclear norm), GAN training (spectral normalization), Lipschitz bounds, low-rank adaptation (LoRA), and generalization theory (PAC-Bayes). This section demonstrates each connection computationally.
Code cell 29
# === 8.1 Weight Decay = Frobenius Norm Regularization ===
import numpy as np
import numpy.linalg as la
np.random.seed(42)
# Simple 1-layer linear model: minimize ||XW - Y||_F^2 + lambda*||W||_F^2
# Optimal W* = (X^T X + lambda I)^{-1} X^T Y (Ridge regression)
n_samples, d_in, d_out = 20, 10, 3
X = np.random.randn(n_samples, d_in)
W_true = np.random.randn(d_in, d_out)
Y = X @ W_true + 0.5 * np.random.randn(n_samples, d_out)
print(f'Data shape: X={X.shape}, Y={Y.shape}, W_true has ||W||_F={la.norm(W_true, "fro"):.3f}')
for lam in [0.0, 0.01, 0.1, 1.0, 10.0]:
# Ridge solution
W_ridge = la.solve(X.T @ X + lam * np.eye(d_in), X.T @ Y)
train_loss = la.norm(X @ W_ridge - Y, 'fro')**2 / n_samples
fro_norm = la.norm(W_ridge, 'fro')
param_err = la.norm(W_ridge - W_true, 'fro')
print(f' lambda={lam:.2f}: train_loss={train_loss:.3f}, ||W||_F={fro_norm:.3f}, param_err={param_err:.3f}')
print('\nObservation: larger lambda -> smaller ||W||_F, but higher bias (train loss increases)')
Code cell 30
# === 8.2 Spectral Normalization for Neural Networks ===
np.random.seed(0)
def spectral_norm_power_iter(W, u, n_iter=1):
"""One step of power iteration for spectral norm (as in PyTorch SN)."""
for _ in range(n_iter):
v = W.T @ u
v = v / la.norm(v)
u = W @ v
sigma = la.norm(u)
u = u / sigma
return sigma, u, v
# Multi-layer network: Lipschitz constant = product of spectral norms
layers = [np.random.randn(64, 128), np.random.randn(32, 64), np.random.randn(16, 32)]
print('Without spectral normalization:')
lip_product = 1.0
for i, W in enumerate(layers):
sigma1 = la.svdvals(W)[0]
lip_product *= sigma1
print(f' Layer {i}: ||W||_2={sigma1:.4f}')
print(f' Network Lipschitz bound: {lip_product:.4f}')
print('\nWith spectral normalization (divide each W by its sigma_1):')
lip_product_sn = 1.0
for i, W in enumerate(layers):
sigma1 = la.svdvals(W)[0]
W_sn = W / sigma1
sigma1_sn = la.svdvals(W_sn)[0]
lip_product_sn *= sigma1_sn
print(f' Layer {i}: ||W_SN||_2={sigma1_sn:.6f} (exactly 1)')
print(f' Network Lipschitz bound: {lip_product_sn:.6f} (exactly 1)')
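The spectral_norm_power_iter helper above is defined but not exercised; here is a minimal usage sketch with a persistent u vector, the way spectral-norm layers typically maintain it across training steps (the layer choice and number of calls are illustrative):
# Repeated one-step calls with a persistent u converge to sigma_1
W = layers[0]
u = np.random.randn(W.shape[0]); u /= la.norm(u)
for _ in range(5):
    sigma_est, u, v = spectral_norm_power_iter(W, u, n_iter=1)
print(f'Power-iteration estimate after 5 calls: {sigma_est:.4f} (exact sigma_1 = {la.svdvals(W)[0]:.4f})')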
Code cell 31
# === 8.3 Eckart-Young Theorem and Low-Rank Approximation ===
np.random.seed(3)
# Simulate a weight matrix (e.g., attention projection)
d = 64
W = np.random.randn(d, d)
U, s, Vt = la.svd(W)
print(f'Weight matrix W: {W.shape}, rank={d}')
print(f'Singular value decay: sigma_1={s[0]:.3f}, sigma_16={s[15]:.3f}, sigma_32={s[31]:.3f}, sigma_64={s[63]:.3f}')
# Compute compression ratios and errors for rank-k approximations
print(f'\n k ||W-W_k||_F ||W-W_k||_2 compressed_params ratio')
for k in [1, 4, 8, 16, 32, 48, 64]:
err_F = np.sqrt(np.sum(s[k:]**2))
err_2 = s[k] if k < d else 0.0
compressed = k * (d + d) # k*(d_in + d_out)
total = d * d
ratio = total / compressed if compressed > 0 else float('inf')
print(f' {k:3d} {err_F:10.4f} {err_2:10.4f} {compressed:16d} {ratio:5.1f}x')
print('\nEckart-Young: rank-k SVD truncation is OPTIMAL for both F and 2-norm')
Code cell 32
# === 8.4 LoRA: Low-Rank Adaptation ===
np.random.seed(42)
# Simulate LoRA fine-tuning
d = 32 # model dimension
r_values = [1, 2, 4, 8, 16] # LoRA ranks
# Pretrained weight
W0 = np.random.randn(d, d)
# True weight update (low-rank by assumption)
r_true = 3
U_true = np.random.randn(d, r_true)
V_true = np.random.randn(d, r_true)
DeltaW_true = 0.1 * (U_true @ V_true.T)
print(f'True update: rank={r_true}, ||DeltaW||_F={la.norm(DeltaW_true,"fro"):.4f}')
print(f'True update: ||DeltaW||_*={la.norm(DeltaW_true,"nuc"):.4f}')
print(f'Nuclear/Frobenius ratio: {la.norm(DeltaW_true,"nuc")/la.norm(DeltaW_true,"fro"):.4f}')
print(f'\n r params ||BA||_* ||BA||_F recovery_err')
for r in r_values:
# Random init (as in LoRA paper: B=0, A=Gaussian)
B_init = np.zeros((d, r))
A_init = np.random.randn(r, d) * 0.02
# Optimal fit: find best rank-r approx to DeltaW_true
U, s, Vt = la.svd(DeltaW_true)
B_opt = U[:, :r] * s[:r]
A_opt = Vt[:r, :]
DeltaW_approx = B_opt @ A_opt
params = r * (d + d)
nuc = la.norm(DeltaW_approx, 'nuc')
fro = la.norm(DeltaW_approx, 'fro')
err = la.norm(DeltaW_approx - DeltaW_true, 'fro')
print(f' {r:2d} {params:6d} {nuc:.4f} {fro:.4f} {err:.6f}')
9. Proximal Methods and Nuclear Norm Minimization
The proximal operator of the nuclear norm is singular value thresholding (SVT). This enables efficient optimization of nuclear-norm regularized problems.
Code cell 34
# === 9.1 Proximal Gradient Descent for Nuclear Norm Minimization ===
np.random.seed(0)
def svt(A, tau):
"""Proximal operator of tau * ||.||_*"""
U, s, Vt = la.svd(A, full_matrices=False)
return U @ np.diag(np.maximum(s - tau, 0)) @ Vt
# Matrix completion: recover low-rank matrix from partial observations
m, n, r_true = 10, 8, 2
U_true = np.random.randn(m, r_true)
V_true = np.random.randn(n, r_true)
M_true = U_true @ V_true.T # rank-2 ground truth
# Observe 40% of entries randomly
obs_rate = 0.4
Omega = np.random.rand(m, n) < obs_rate # observation mask
M_obs = M_true * Omega # observed (zero elsewhere)
print(f'True matrix: {m}x{n}, rank={r_true}')
print(f'Observations: {Omega.sum()} / {m*n} entries ({100*obs_rate:.0f}%)')
print(f'True ||M||_*: {la.norm(M_true,"nuc"):.4f}')
# Proximal gradient descent: min (1/2)||P_Omega(X - M_obs)||_F^2 + lambda*||X||_*
lam = 0.1
step = 0.5
X = np.zeros((m, n))
losses = []
for t in range(50):
grad = (X - M_obs) * Omega # gradient of data term
X = svt(X - step * grad, step * lam)
loss = 0.5 * la.norm((X - M_obs) * Omega, 'fro')**2 + lam * la.norm(X, 'nuc')
losses.append(loss)
print(f'\nAfter 50 iterations of proximal gradient:')
print(f' Recovery error ||X - M_true||_F: {la.norm(X - M_true, "fro"):.4f}')
print(f' Rank of recovered X: {np.linalg.matrix_rank(X, tol=1e-4)}')
print(f' ||X||_*: {la.norm(X, "nuc"):.4f}')
Code cell 35
# === 9.2 Stable Rank: A Smooth Proxy for Rank ===
np.random.seed(1)
def stable_rank(A):
s = la.svdvals(A)
return np.sum(s**2) / s[0]**2
print('Stable rank rho(A) = ||A||_F^2 / ||A||_2^2')
print(f' Range: [1, rank(A)]')
print()
# Explore how stable rank varies
for case, A in [
('Identity I_5', np.eye(5)),
('Rank-1 matrix', np.outer(np.random.randn(5), np.random.randn(7))),
('Full-rank random', np.random.randn(5, 7)),
('Near-rank-2 random', np.random.randn(5,2) @ np.random.randn(2,7) + 0.01*np.random.randn(5,7)),
('Repeated rows (rank 1)', np.ones(5)[:, None] * np.random.randn(1, 7)),
]:
r = np.linalg.matrix_rank(A)
rho = stable_rank(A)
print(f' {case:25s}: rank={r:2d}, rho={rho:.3f}')
print('\nPAC-Bayes: generalization gap ~ sqrt(rho(W) * ||W||_2^2 / n)')
print('=> matrices with low stable rank generalize better (for fixed spectral norm)')
Code cell 36
# === 9.3 Dual Norms: Nuclear ↔ Spectral ===
np.random.seed(5)
A = np.random.randn(4, 5)
# Verify: ||A||_* = max_{||B||_2 <= 1} tr(A^T B)
# Maximizer is B = U V^T (from SVD of A)
U, s, Vt = la.svd(A, full_matrices=False)
B_opt = U @ Vt # optimal B (||B||_2 = 1)
nuc_norm = la.norm(A, 'nuc')
trace_at_opt = np.trace(A.T @ B_opt)
print(f'||A||_* = {nuc_norm:.6f}')
print(f'tr(A^T B_opt) = {trace_at_opt:.6f} (B_opt = U V^T from SVD)')
print(f'||B_opt||_2 = {la.norm(B_opt, 2):.6f} (should be 1)')
print(f'PASS - nuclear = max trace inner product: {np.isclose(nuc_norm, trace_at_opt)}')
# Verify suboptimality of random B with ||B||_2 <= 1
max_trace_random = 0
for _ in range(5000):
B_rand = np.random.randn(*A.shape)
B_rand /= la.norm(B_rand, 2) # scale to ||B||_2 = 1
max_trace_random = max(max_trace_random, np.trace(A.T @ B_rand))
print(f'\nBest random B over 5000 trials: tr = {max_trace_random:.4f}')
print(f'Optimal B achieves: {trace_at_opt:.4f} ({100*(trace_at_opt-max_trace_random)/trace_at_opt:.2f}% better)')
9.4 Visualization: Norm Landscape
How norms of a matrix relate to its singular value spectrum.
Code cell 38
# === 9.4 Visualization: Norms vs Singular Value Spectra ===
try:
import matplotlib.pyplot as plt
plt.style.use('seaborn-v0_8-whitegrid')
HAS_MPL = True
except ImportError:
HAS_MPL = False
np.random.seed(42)
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# Left: compare norms across matrices with varying rank
ranks = range(1, 21)
fro_vals, spec_vals, nuc_vals, rho_vals = [], [], [], []
for r in ranks:
# Random rank-r matrix, normalized
U, _ = np.linalg.qr(np.random.randn(20, r))
V, _ = np.linalg.qr(np.random.randn(20, r))
A = U @ np.diag(np.ones(r)) @ V.T
fro_vals.append(la.norm(A, 'fro'))
spec_vals.append(la.norm(A, 2))
nuc_vals.append(la.norm(A, 'nuc'))
rho_vals.append(stable_rank(A))
ax = axes[0]
ax.plot(ranks, spec_vals, 'o-', label='Spectral $\|A\|_2$', color='steelblue')
ax.plot(ranks, fro_vals, 's-', label='Frobenius $\|A\|_F$', color='coral')
ax.plot(ranks, nuc_vals, '^-', label='Nuclear $\|A\|_*$', color='seagreen')
ax.set_xlabel('Rank of A'); ax.set_ylabel('Norm value')
ax.set_title('Norms grow with rank\n(equal singular values $= 1$)')
ax.legend()
# Right: stable rank vs true rank
ax2 = axes[1]
ax2.plot(ranks, rho_vals, 'o-', color='purple', label='Stable rank $\\rho(A)$')
ax2.plot(ranks, list(ranks), '--', color='gray', label='True rank')
ax2.set_xlabel('True rank'); ax2.set_ylabel('Stable rank')
ax2.set_title('Stable rank vs true rank\n(equal singular values: rho = rank)')
ax2.legend()
plt.suptitle('Matrix norm behavior with rank', fontsize=13)
plt.tight_layout()
plt.show()
print('Key: nuclear norm grows linearly in rank; spectral stays constant; Frobenius grows as sqrt(rank)')
10. Summary
| Norm | Formula | Computed via | AI connection |
|---|---|---|---|
| Frobenius | $\sqrt{\sum_{i,j} a_{ij}^2} = \Vert\sigma\Vert_2$ | norm(A,'fro') | Weight decay (L2) |
| Spectral | $\sigma_1$ | norm(A,2) | Lipschitz bound; GAN SN |
| Nuclear | $\sum_i \sigma_i$ | norm(A,'nuc') | Rank proxy; LoRA implicit reg |
| Matrix-1 | max abs column sum | norm(A,1) | Input/output norm bounds |
| Condition | $\sigma_1 / \sigma_n$ | cond(A) | Numerical stability; optim |
| Stable rank | $\Vert A\Vert_F^2 / \Vert A\Vert_2^2$ | custom fn | PAC-Bayes generalization |
Weyl's inequality makes singular values numerically stable (each $\sigma_i$ is 1-Lipschitz in $A$ with respect to $\|\cdot\|_2$).
Eckart-Young makes SVD the optimal low-rank approximation method.
SVT (singular value thresholding) is the proximal operator for nuclear norm regularization.
Code cell 40
# === 10. Final Check: Run all key computations ===
np.random.seed(42)
A = np.random.randn(5, 4)
s = la.svdvals(A)
r = np.linalg.matrix_rank(A)
fro = la.norm(A, 'fro')
spec = la.norm(A, 2)
nuc = la.norm(A, 'nuc')
rho = fro**2 / spec**2
print('=== Final Summary ===')
print(f'A: {A.shape}, rank={r}')
print(f'Singular values: {np.round(s, 4)}')
print(f'||A||_2 = {spec:.4f} = sigma_1')
print(f'||A||_F = {fro:.4f} = ||sigma||_2')
print(f'||A||_* = {nuc:.4f} = sum(sigma)')
print(f'rho(A) = {rho:.4f} (stable rank, in [1, {r}])')
# Key inequalities
print('\n=== Key Inequalities ===')
checks = [
('||A||_2 <= ||A||_F', spec <= fro),
('||A||_F <= sqrt(r)*||A||_2', fro <= np.sqrt(r)*spec + 1e-10),
('||A||_F <= ||A||_*', fro <= nuc + 1e-10),
('||A||_* <= sqrt(r)*||A||_F', nuc <= np.sqrt(r)*fro + 1e-10),
('1 <= rho <= rank', 1 <= rho <= r + 1e-10),
]
for desc, ok in checks:
print(f' {"PASS" if ok else "FAIL"} - {desc}')
11. Visualization: Condition Number and Loss Landscape
The condition number $\kappa$ of the Hessian determines the shape of the loss landscape. High $\kappa$ means elongated elliptical contours: gradient descent zigzags slowly, while Newton's method (which rescales by $H^{-1}$) converges quickly regardless of $\kappa$.
Code cell 42
# === 11.1 Visualizing Ill-Conditioned Quadratics ===
import numpy as np
import numpy.linalg as la
try:
import matplotlib.pyplot as plt
plt.style.use('seaborn-v0_8-whitegrid')
HAS_MPL = True
except ImportError:
HAS_MPL = False
np.random.seed(42)
if HAS_MPL:
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
x_range = np.linspace(-3, 3, 200)
y_range = np.linspace(-3, 3, 200)
X, Y = np.meshgrid(x_range, y_range)
for ax, kappa, title in zip(axes,
[1, 10, 100],
['kappa=1 (sphere)', 'kappa=10 (ellipse)', 'kappa=100 (flat valley)']):
# Loss = x^2/(2*lambda_1) + y^2/(2*lambda_2)
# Hessian = diag(lambda_1, lambda_2), kappa = lambda_2/lambda_1
lam1, lam2 = 1.0, kappa
Z = 0.5 * (lam1 * X**2 + lam2 * Y**2)
# Contour plot
levels = np.logspace(-1, 1.5, 15)
ax.contourf(X, Y, Z, levels=levels, cmap='viridis', alpha=0.7)
ax.contour(X, Y, Z, levels=levels, colors='white', linewidths=0.5, alpha=0.5)
# Gradient descent trajectory from (2, 0.5)
lr = 1.0 / lam2 # step size limited by larger eigenvalue
# Simulate trajectory
xs, ys = [2.0], [0.5]
for _ in range(15):
xs.append(xs[-1] - lr * lam1 * xs[-1])
ys.append(ys[-1] - lr * lam2 * ys[-1])
ax.plot(xs, ys, 'r.-', lw=1.5, ms=4, label='GD')
ax.set_title(f'{title}\n$\\kappa$={kappa}')
ax.set_xlabel('$x_1$'); ax.set_ylabel('$x_2$')
ax.set_xlim(-2.5, 2.5); ax.set_ylim(-2.5, 2.5)
plt.suptitle('Effect of condition number on gradient descent convergence', fontsize=13)
plt.tight_layout()
plt.show()
print('High kappa: gradient descent zigzags (slow); Newton is unaffected')
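The claim that Newton's method is unaffected by $\kappa$ can be checked directly on the same family of quadratics. A minimal sketch (the step size, tolerance, and iteration cap below are illustrative choices):
# GD vs Newton on f(x) = 0.5 * (x1^2 + kappa * x2^2); gradient = H x with H = diag(1, kappa)
for kappa in [1, 10, 100, 1000]:
    H = np.diag([1.0, float(kappa)])
    x = np.array([2.0, 0.5])
    lr = 1.0 / kappa                              # step size limited by the largest eigenvalue
    n_gd = 0
    while la.norm(H @ x) > 1e-8 and n_gd < 100000:
        x = x - lr * (H @ x)
        n_gd += 1
    x_nt = np.array([2.0, 0.5])
    x_nt = x_nt - la.solve(H, H @ x_nt)           # one Newton step: x - H^{-1} grad
    print(f'kappa={kappa:5d}: GD iterations to ||grad||<1e-8: {n_gd:6d} | after one Newton step ||x|| = {la.norm(x_nt):.1e}')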
12. Norm Comparison: Complete Example
A comprehensive comparison of all matrix norms on a concrete example, showing all relationships and verifying theoretical inequalities.
Code cell 44
# === 12.1 Complete Norm Taxonomy on One Matrix ===
# The matrix A = diag(4, 2, 1) from the notes
A = np.diag([4., 2., 1.])
sigma = la.svdvals(A)
r = np.linalg.matrix_rank(A)
print('A = diag(4, 2, 1)')
print(f'Singular values: {sigma}')
print()
norms = {
'Frobenius': la.norm(A, 'fro'),
'Spectral (sigma_1)':la.norm(A, 2),
'Nuclear (sum)': la.norm(A, 'nuc'),
'Schatten S_3': np.sum(sigma**3)**(1/3),
'Matrix-1': la.norm(A, 1),
'Matrix-inf': la.norm(A, np.inf),
'Stable rank rho': la.norm(A,'fro')**2 / la.norm(A,2)**2,
'Cond number kappa': la.cond(A),
}
for name, val in norms.items():
print(f' {name:25s}: {val:.6f}')
print('\nKey inequalities:')
f = la.norm(A,'fro')
s2 = la.norm(A, 2)
sn = la.norm(A,'nuc')
print(f' ||A||_2 <= ||A||_F <= sqrt(r)*||A||_2: {s2:.3f} <= {f:.3f} <= {np.sqrt(r)*s2:.3f}')
print(f' ||A||_F <= ||A||_* <= sqrt(r)*||A||_F: {f:.3f} <= {sn:.3f} <= {np.sqrt(r)*f:.3f}')
print(f' All inequalities hold: {s2<=f<=np.sqrt(r)*s2 and f<=sn<=np.sqrt(r)*f}')
Code cell 45
# === 12.2 Visualization: Singular Value Spectra and Norms ===
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# Left: different spectra and their norm ratios
n_sig = 8
spectra = {
'Flat (I)': np.ones(n_sig),
'Linear decay': np.linspace(1, 0.1, n_sig),
'Exponential': np.exp(-np.arange(n_sig)*0.5),
'Power law': 1.0/(np.arange(1, n_sig+1)),
'Rank-1 spike': np.array([1.]+[0.01]*(n_sig-1)),
}
colors = ['steelblue','coral','seagreen','purple','orange']
ax = axes[0]
for (label, s), c in zip(spectra.items(), colors):
ax.plot(range(1, n_sig+1), s/s[0], 'o-', label=label, color=c)
ax.set_xlabel('Index $i$'); ax.set_ylabel('$\\sigma_i / \\sigma_1$')
ax.set_title('Normalized singular value spectra')
ax.legend(fontsize=8)
# Right: stable rank for each spectrum
rho_vals = []
labels = []
for label, s in spectra.items():
rho = np.sum(s**2) / s[0]**2
rho_vals.append(rho)
labels.append(label)
ax2 = axes[1]
bars = ax2.bar(range(len(rho_vals)), rho_vals, color=colors)
ax2.axhline(n_sig, color='black', ls='--', alpha=0.4, label=f'rank={n_sig}')
ax2.set_xticks(range(len(labels)))
ax2.set_xticklabels([l.split(' ')[0] for l in labels], rotation=20)
ax2.set_ylabel('Stable rank $\\rho$')
ax2.set_title('Stable rank by spectrum type')
ax2.legend()
plt.tight_layout()
plt.show()
print('Flat spectrum (identity-like) -> rho = rank')
print('Rank-1 spike -> rho = 1 (extreme low rank)')
Code cell 46
# === 12.3 Perturbation Visualization: Weyl vs Eigenvalue Stability ===
if HAS_MPL:
np.random.seed(42)
n = 5
eps_vals = np.logspace(-3, 0, 20)
# Symmetric matrix (well-conditioned eigenvalues)
A_sym = np.random.randn(n, n)
A_sym = A_sym + A_sym.T # symmetric
# Non-normal upper-triangular matrix (sensitive eigenvalues)
A_nn = np.triu(np.random.randn(n, n))
np.fill_diagonal(A_nn, np.arange(1, n+1, dtype=float))
sv_changes_sym = []
sv_changes_nn = []
ev_changes_sym = []
ev_changes_nn = []
n_trials = 100
for eps in eps_vals:
sv_ch_s, sv_ch_n, ev_ch_s, ev_ch_n = [], [], [], []
for _ in range(n_trials):
E = eps * np.random.randn(n, n)
sv_ch_s.append(np.max(np.abs(la.svdvals(A_sym+E) - la.svdvals(A_sym))))
sv_ch_n.append(np.max(np.abs(la.svdvals(A_nn+E) - la.svdvals(A_nn))))
# Eigenvalue changes (sorted by real part)
ev_s = np.sort(np.real(la.eigvals(A_sym+E)))
ev_s0 = np.sort(np.real(la.eigvals(A_sym)))
ev_ch_s.append(np.max(np.abs(ev_s - ev_s0)))
# Non-normal: use absolute values of eigenvalues
ev_n_pert = np.sort(np.abs(la.eigvals(A_nn+E)))
ev_n_orig = np.sort(np.abs(la.eigvals(A_nn)))
ev_ch_n.append(np.max(np.abs(ev_n_pert - ev_n_orig)))
sv_changes_sym.append(np.mean(sv_ch_s))
sv_changes_nn.append(np.mean(sv_ch_n))
ev_changes_sym.append(np.mean(ev_ch_s))
ev_changes_nn.append(np.mean(ev_ch_n))
fig, ax = plt.subplots(figsize=(9, 5))
ax.loglog(eps_vals, eps_vals, 'k--', alpha=0.5, label='Weyl bound ($=\\epsilon$)')
ax.loglog(eps_vals, sv_changes_sym, 'b-o', ms=4, label='$|\\Delta\\sigma|$ symmetric')
ax.loglog(eps_vals, sv_changes_nn, 'b--s', ms=4, label='$|\\Delta\\sigma|$ non-normal')
ax.loglog(eps_vals, ev_changes_sym, 'r-o', ms=4, label='$|\\Delta\\lambda|$ symmetric')
ax.loglog(eps_vals, ev_changes_nn, 'r--s', ms=4, label='$|\\Delta\\lambda|$ non-normal')
ax.set_xlabel('Perturbation size $\\|E\\|_2$')
ax.set_ylabel('Mean change in spectral quantities')
ax.set_title('Weyl stability: singular values vs eigenvalues')
ax.legend(fontsize=9)
plt.tight_layout()
plt.show()
print('Singular values (both types): follow Weyl bound closely')
print('Eigenvalues of non-normal: can exceed Weyl bound significantly')
13. Practical Recipe: Which Norm to Use
| Goal | Norm | Why |
|---|---|---|
| Regularize weights | Frobenius ($\Vert W\Vert_F^2$) | Gradient is $2W$; universal in optimizers |
| GAN Lipschitz | Spectral ($\Vert W\Vert_2$) | Product bounds network Lipschitz |
| Low-rank compression | Nuclear ($\Vert W\Vert_*$) | Convex rank proxy; SVT optimizer |
| Numerical stability | Condition $\kappa$ | Quantifies precision loss |
| Generalization | Stable rank $\rho$ | PAC-Bayes bounds |
| Best rank-k approx | Either $\Vert\cdot\Vert_F$ or $\Vert\cdot\Vert_2$ | Eckart-Young: SVD truncation is optimal |
Code cell 48
# === 13. Quick-access functions for all norms ===
import numpy as np
import numpy.linalg as la
def norm_report(A, name='A'):
"""Print a complete norm report for matrix A."""
s = la.svdvals(A)
r = np.linalg.matrix_rank(A)
print(f'=== Norm Report: {name} ({A.shape}, rank={r}) ===')
print(f' Frobenius ||A||_F = {la.norm(A, "fro"):.4f}')
print(f' Spectral ||A||_2 = {la.norm(A, 2):.4f} (= sigma_1={s[0]:.4f})')
print(f' Nuclear ||A||_* = {la.norm(A, "nuc"):.4f} (= sum(sigma))')
print(f' Matrix-1 ||A||_1 = {la.norm(A, 1):.4f}')
print(f' Matrix-inf ||A||_inf= {la.norm(A, np.inf):.4f}')
if min(A.shape) > 1 and r == min(A.shape):
print(f' Cond num kappa = {la.cond(A):.4f}')
print(f' Stable rank rho = {la.norm(A,"fro")**2/la.norm(A,2)**2:.4f}')
print()
np.random.seed(42)
norm_report(np.eye(4), 'I_4')
norm_report(np.random.randn(5, 4), 'Random 5x4')
norm_report(np.diag([10., 1., 0.01]), 'diag(10, 1, 0.01)')
print('PASS - All norms computed successfully')
Code cell 49
# === 13.2 Power Iteration Convergence Rate ===
np.random.seed(10)
A = np.random.randn(10, 8)
true_s = la.svdvals(A)
ratio = true_s[1] / true_s[0]
v = np.random.randn(8); v /= la.norm(v)
estimates = []
for k in range(30):
u = A @ v; u /= la.norm(u)
v_new = A.T @ u
sigma_est = la.norm(v_new)
v = v_new / sigma_est
estimates.append(abs(sigma_est - true_s[0]))
print(f'True sigma_1 = {true_s[0]:.6f}')
print(f'sigma_2/sigma_1 = {ratio:.4f} (convergence rate per iter = {ratio**2:.4f})')
print()
print('Iteration | Error | Predicted')
for k in [0, 1, 2, 4, 8, 15, 29]:
pred = abs(estimates[0]) * (ratio**2)**k
print(f' k={k:3d} | {estimates[k]:.2e} | {pred:.2e}')
print(f'\nPASS - error decreases at predicted rate: {estimates[29] < estimates[0] * (ratio**2)**29 * 10}')
Code cell 50
# === 13.3 Tikhonov via SVD (numerically stable) ===
def tikhonov_svd(A, b, lam):
"""Tikhonov solution via SVD: numerically stable even for rank-deficient A."""
U, s, Vt = la.svd(A, full_matrices=False)
d = s / (s**2 + lam) # modified singular value filter
return Vt.T @ (d * (U.T @ b))
np.random.seed(0)
m, n = 15, 10
# Near rank-deficient A
U, _, Vt = la.svd(np.random.randn(m, n))
s_true = np.exp(-np.arange(n) * 0.8) # exponentially decaying
A = U[:, :n] @ np.diag(s_true) @ Vt
x_true = np.ones(n)
b = A @ x_true + 0.01 * np.random.randn(m)
print(f'kappa(A) = {la.cond(A):.2e}')
print(f'\n lambda cond(ATA+lI) rel_err')
for lam in [0, 1e-4, 1e-2, 1.0]:
x_hat = tikhonov_svd(A, b, lam) if lam > 0 else la.lstsq(A, b, rcond=None)[0]
AtA_reg = A.T @ A + lam * np.eye(n)
kappa_reg = la.cond(AtA_reg)
err = la.norm(x_hat - x_true) / la.norm(x_true)
print(f' {lam:.0e} {kappa_reg:.2e} {err:.4f}')
print('\nSVD-based Tikhonov: stable even when lambda=0 gives kappa -> inf')
14. Closing: Matrix Norms and AI
Every major neural network training technique has a matrix norm interpretation:
- Weight decay: $\lambda \|W\|_F^2$ → the gradient $2\lambda W$ shrinks all singular values
- Spectral normalization: $W \mapsto W / \sigma_1(W)$ → Lipschitz-1 layers
- LoRA: $\Delta W = BA$ with rank $r \ll d$ → implicit nuclear norm regularization
- Gradient clipping: if $\|g\| > \tau$ then $g \leftarrow (\tau/\|g\|)\, g$ → controls the spectral/Frobenius norm of the gradient (a minimal sketch follows this list)
- Attention: $QK^\top/\sqrt{d_k}$ scaling → spectral normalization of the logit matrix
- SVD compression: truncate at rank $k$ → Eckart-Young optimal approximation
- MLA: low-rank KV cache → nuclear-norm motivated compression
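As a concrete instance of the clipping rule above, a minimal sketch of global-norm gradient clipping (the threshold tau and the gradient shapes are illustrative):
# Global-norm clipping: rescale the whole gradient list so its combined norm never exceeds tau
def clip_by_global_norm(grads, tau):
    total = np.sqrt(sum(la.norm(g, 'fro')**2 for g in grads))
    scale = min(1.0, tau / total) if total > 0 else 1.0
    return [g * scale for g in grads], total
grads = [np.random.randn(64, 32), np.random.randn(32, 8)]
clipped, before = clip_by_global_norm(grads, tau=1.0)
after = np.sqrt(sum(la.norm(g, 'fro')**2 for g in clipped))
print(f'||g|| before = {before:.2f}, after clipping = {after:.2f} (tau = 1.0)')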
The mathematical tools — Weyl's inequality, Eckart-Young, proximal operators, dual norms — are the precise language in which these techniques are analyzed and understood.